i965: Add Gen assertion checks for newer instructions.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_generator.cpp
/* Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"

extern "C" {
#include "brw_eu.h"
#include "main/macros.h"
#include "program/prog_print.h"
#include "program/prog_parameter.h"
};

namespace brw {

struct brw_reg
vec4_instruction::get_dst(void)
{
   struct brw_reg brw_reg;

   switch (dst.file) {
   case GRF:
      brw_reg = brw_vec8_grf(dst.reg + dst.reg_offset, 0);
      brw_reg = retype(brw_reg, dst.type);
      brw_reg.dw1.bits.writemask = dst.writemask;
      break;

   case MRF:
      brw_reg = brw_message_reg(dst.reg + dst.reg_offset);
      brw_reg = retype(brw_reg, dst.type);
      brw_reg.dw1.bits.writemask = dst.writemask;
      break;

   case HW_REG:
      brw_reg = dst.fixed_hw_reg;
      break;

   case BAD_FILE:
      brw_reg = brw_null_reg();
      break;

   default:
      assert(!"not reached");
      brw_reg = brw_null_reg();
      break;
   }
   return brw_reg;
}

struct brw_reg
vec4_instruction::get_src(const struct brw_vec4_prog_data *prog_data, int i)
{
   struct brw_reg brw_reg;

   switch (src[i].file) {
   case GRF:
      brw_reg = brw_vec8_grf(src[i].reg + src[i].reg_offset, 0);
      brw_reg = retype(brw_reg, src[i].type);
      brw_reg.dw1.bits.swizzle = src[i].swizzle;
      if (src[i].abs)
         brw_reg = brw_abs(brw_reg);
      if (src[i].negate)
         brw_reg = negate(brw_reg);
      break;

   case IMM:
      switch (src[i].type) {
      case BRW_REGISTER_TYPE_F:
         brw_reg = brw_imm_f(src[i].imm.f);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_reg = brw_imm_d(src[i].imm.i);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_reg = brw_imm_ud(src[i].imm.u);
         break;
      default:
         assert(!"not reached");
         brw_reg = brw_null_reg();
         break;
      }
      break;

   case UNIFORM:
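      /* Two vec4 uniforms are packed per GRF, so uniform slot N lives in
       * GRF (dispatch_grf_start_reg + N / 2) at dword offset (N % 2) * 4;
       * e.g. with dispatch_grf_start_reg = 1, slot 5 is read as g3.4<0;4,1>.
       */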
      brw_reg = stride(brw_vec4_grf(prog_data->dispatch_grf_start_reg +
                                    (src[i].reg + src[i].reg_offset) / 2,
                                    ((src[i].reg + src[i].reg_offset) % 2) * 4),
                       0, 4, 1);
      brw_reg = retype(brw_reg, src[i].type);
      brw_reg.dw1.bits.swizzle = src[i].swizzle;
      if (src[i].abs)
         brw_reg = brw_abs(brw_reg);
      if (src[i].negate)
         brw_reg = negate(brw_reg);

      /* This should have been moved to pull constants. */
      assert(!src[i].reladdr);
      break;

   case HW_REG:
      brw_reg = src[i].fixed_hw_reg;
      break;

   case BAD_FILE:
      /* Probably unused. */
      brw_reg = brw_null_reg();
      break;
   case ATTR:
   default:
      assert(!"not reached");
      brw_reg = brw_null_reg();
      break;
   }

   return brw_reg;
}

vec4_generator::vec4_generator(struct brw_context *brw,
                               struct gl_shader_program *shader_prog,
                               struct gl_program *prog,
                               struct brw_vec4_prog_data *prog_data,
                               void *mem_ctx,
                               bool debug_flag)
   : brw(brw), shader_prog(shader_prog), prog(prog), prog_data(prog_data),
     mem_ctx(mem_ctx), debug_flag(debug_flag)
{
   shader = shader_prog ? shader_prog->_LinkedShaders[MESA_SHADER_VERTEX] : NULL;

   p = rzalloc(mem_ctx, struct brw_compile);
   brw_init_compile(brw, p, mem_ctx);
}

vec4_generator::~vec4_generator()
{
}

void
vec4_generator::mark_surface_used(unsigned surf_index)
{
   assert(surf_index < BRW_MAX_VEC4_SURFACES);

   prog_data->binding_table_size = MAX2(prog_data->binding_table_size,
                                        surf_index + 1);
}

void
vec4_generator::generate_math1_gen4(vec4_instruction *inst,
                                    struct brw_reg dst,
                                    struct brw_reg src)
{
   brw_math(p,
            dst,
            brw_math_function(inst->opcode),
            inst->base_mrf,
            src,
            BRW_MATH_DATA_VECTOR,
            BRW_MATH_PRECISION_FULL);
}

static void
check_gen6_math_src_arg(struct brw_reg src)
{
   /* Source swizzles are ignored. */
   assert(!src.abs);
   assert(!src.negate);
   assert(src.dw1.bits.swizzle == BRW_SWIZZLE_XYZW);
}

void
vec4_generator::generate_math1_gen6(vec4_instruction *inst,
                                    struct brw_reg dst,
                                    struct brw_reg src)
{
   /* Can't do writemask because math can't be align16. */
   assert(dst.dw1.bits.writemask == WRITEMASK_XYZW);
   check_gen6_math_src_arg(src);

   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_math(p,
            dst,
            brw_math_function(inst->opcode),
            inst->base_mrf,
            src,
            BRW_MATH_DATA_SCALAR,
            BRW_MATH_PRECISION_FULL);
   brw_set_access_mode(p, BRW_ALIGN_16);
}

void
vec4_generator::generate_math2_gen7(vec4_instruction *inst,
                                    struct brw_reg dst,
                                    struct brw_reg src0,
                                    struct brw_reg src1)
{
   brw_math2(p,
             dst,
             brw_math_function(inst->opcode),
             src0, src1);
}

void
vec4_generator::generate_math2_gen6(vec4_instruction *inst,
                                    struct brw_reg dst,
                                    struct brw_reg src0,
                                    struct brw_reg src1)
{
   /* Can't do writemask because math can't be align16. */
   assert(dst.dw1.bits.writemask == WRITEMASK_XYZW);
   /* Source swizzles are ignored. */
   check_gen6_math_src_arg(src0);
   check_gen6_math_src_arg(src1);

   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_math2(p,
             dst,
             brw_math_function(inst->opcode),
             src0, src1);
   brw_set_access_mode(p, BRW_ALIGN_16);
}

void
vec4_generator::generate_math2_gen4(vec4_instruction *inst,
                                    struct brw_reg dst,
                                    struct brw_reg src0,
                                    struct brw_reg src1)
{
   /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
    * "Message Payload":
    *
    * "Operand0[7]. For the INT DIV functions, this operand is the
    * denominator."
    * ...
    * "Operand1[7]. For the INT DIV functions, this operand is the
    * numerator."
    */
   bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
   struct brw_reg &op0 = is_int_div ? src1 : src0;
   struct brw_reg &op1 = is_int_div ? src0 : src1;

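   /* The second operand is passed in the MRF following base_mrf; saturate
    * and predication are cleared around this MOV so that they only apply
    * to the math instruction itself.
    */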
   brw_push_insn_state(p);
   brw_set_saturate(p, false);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), op1.type), op1);
   brw_pop_insn_state(p);

   brw_math(p,
            dst,
            brw_math_function(inst->opcode),
            inst->base_mrf,
            op0,
            BRW_MATH_DATA_VECTOR,
            BRW_MATH_PRECISION_FULL);
}

void
vec4_generator::generate_tex(vec4_instruction *inst,
                             struct brw_reg dst,
                             struct brw_reg src)
{
   int msg_type = -1;

   if (brw->gen >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
         }
         break;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+. Otherwise, lowered by brw_lower_texture_gradients(). */
            assert(brw->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
         break;
      case SHADER_OPCODE_TXF:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_MS:
         if (brw->gen >= 7)
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
         else
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXS:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      case SHADER_OPCODE_TG4:
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         break;
      default:
         assert(!"should not get here: invalid VS texture opcode");
         break;
      }
   } else {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE;
            assert(inst->mlen == 3);
         } else {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD;
            assert(inst->mlen == 2);
         }
         break;
      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually. */
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS;
         assert(inst->mlen == 4);
         break;
      case SHADER_OPCODE_TXF:
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_LD;
         assert(inst->mlen == 2);
         break;
      case SHADER_OPCODE_TXS:
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO;
         assert(inst->mlen == 2);
         break;
      default:
         assert(!"should not get here: invalid VS texture opcode");
         break;
      }
   }

   assert(msg_type != -1);

   /* Load the message header if present. If there's a texture offset, we need
    * to set it up explicitly and load the offset bitfield. Otherwise, we can
    * use an implied move from g0 to the first message register.
    */
   if (inst->texture_offset) {
      /* Explicitly set up the message header by copying g0 to the MRF. */
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_MOV(p, retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
              retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* Then set the offset bits in DWord 2. */
      brw_set_access_mode(p, BRW_ALIGN_1);
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, inst->base_mrf, 2),
                     BRW_REGISTER_TYPE_UD),
              brw_imm_ud(inst->texture_offset));
      brw_pop_insn_state(p);
   } else if (inst->header_present) {
      /* Set up an implied move from g0 to the MRF. */
      src = brw_vec8_grf(0, 0);
   }

   uint32_t return_format;

   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   uint32_t surface_index = inst->opcode == SHADER_OPCODE_TG4
      ? SURF_INDEX_VEC4_GATHER_TEXTURE(inst->sampler)
      : SURF_INDEX_VEC4_TEXTURE(inst->sampler);

   brw_SAMPLE(p,
              dst,
              inst->base_mrf,
              src,
              surface_index,
              inst->sampler,
              msg_type,
              1, /* response length */
              inst->mlen,
              inst->header_present,
              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
              return_format);

   mark_surface_used(surface_index);
}

void
vec4_generator::generate_vs_urb_write(vec4_instruction *inst)
{
   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 brw_vec8_grf(0, 0), /* src */
                 inst->urb_write_flags,
                 inst->mlen,
                 0, /* response len */
                 inst->offset, /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);
}

void
vec4_generator::generate_gs_urb_write(vec4_instruction *inst)
{
   struct brw_reg src = brw_message_reg(inst->base_mrf);
   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 src,
                 inst->urb_write_flags,
                 inst->mlen,
                 0, /* response len */
                 inst->offset, /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);
}

void
vec4_generator::generate_gs_thread_end(vec4_instruction *inst)
{
   struct brw_reg src = brw_message_reg(inst->base_mrf);
   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 src,
                 BRW_URB_WRITE_EOT,
                 1, /* message len */
                 0, /* response len */
                 0, /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);
}

void
vec4_generator::generate_gs_set_write_offset(struct brw_reg dst,
                                             struct brw_reg src0,
                                             struct brw_reg src1)
{
   /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
    * Header: M0.3):
    *
    *     Slot 0 Offset. This field, after adding to the Global Offset field
    *     in the message descriptor, specifies the offset (in 256-bit units)
    *     from the start of the URB entry, as referenced by URB Handle 0, at
    *     which the data will be accessed.
    *
    * Similar text describes DWORD M0.4, which is slot 1 offset.
    *
    * Therefore, we want to multiply DWORDs 0 and 4 of src0 (the x components
    * of the register for geometry shader invocations 0 and 1) by the
    * immediate value in src1, and store the result in DWORDs 3 and 4 of dst.
    *
    * We can do this with the following EU instruction:
    *
    *     mul(2) dst.3<1>UD src0<8;2,4>UD src1   { Align1 WE_all }
    */
   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4),
           src1);
   brw_set_access_mode(p, BRW_ALIGN_16);
   brw_pop_insn_state(p);
}

void
vec4_generator::generate_gs_set_vertex_count(struct brw_reg dst,
                                             struct brw_reg src)
{
   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_mask_control(p, BRW_MASK_DISABLE);

   /* If we think of the src and dst registers as composed of 8 DWORDs each,
    * we want to pick up the contents of DWORDs 0 and 4 from src, truncate
    * them to WORDs, and then pack them into DWORD 2 of dst.
    *
    * It's easier to get the EU to do this if we think of the src and dst
    * registers as composed of 16 WORDs each; then, we want to pick up the
    * contents of WORDs 0 and 8 from src, and pack them into WORDs 4 and 5 of
    * dst.
    *
    * We can do that by the following EU instruction:
    *
    *     mov (2) dst.4<1>:uw src<8;1,0>:uw   { Align1, Q1, NoMask }
    */
   brw_MOV(p, suboffset(stride(retype(dst, BRW_REGISTER_TYPE_UW), 2, 2, 1), 4),
           stride(retype(src, BRW_REGISTER_TYPE_UW), 8, 1, 0));
   brw_set_access_mode(p, BRW_ALIGN_16);
   brw_pop_insn_state(p);
}

void
vec4_generator::generate_gs_set_dword_2_immed(struct brw_reg dst,
                                              struct brw_reg src)
{
   assert(src.file == BRW_IMMEDIATE_VALUE);

   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, suboffset(vec1(dst), 2), src);
   brw_set_access_mode(p, BRW_ALIGN_16);
   brw_pop_insn_state(p);
}

void
vec4_generator::generate_gs_prepare_channel_masks(struct brw_reg dst)
{
   /* We want to left shift just DWORD 4 (the x component belonging to the
    * second geometry shader invocation) by 4 bits. So generate the
    * instruction:
    *
    *     shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD   { align1 WE_all }
    */
   dst = suboffset(vec1(dst), 4);
   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_SHL(p, dst, dst, brw_imm_ud(4));
   brw_pop_insn_state(p);
}

void
vec4_generator::generate_gs_set_channel_masks(struct brw_reg dst,
                                              struct brw_reg src)
{
   /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
    * Header: M0.5):
    *
    *     15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask
    *
    *        When Swizzle Control = URB_INTERLEAVED this bit controls Vertex 1
    *        DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit controls
    *        Vertex 0 DATA[7]. This bit is ANDed with the corresponding
    *        channel enable to determine the final channel enable. For the
    *        URB_READ_OWORD & URB_READ_HWORD messages, when final channel
    *        enable is 1 it indicates that Vertex 1 DATA [3] will be included
    *        in the writeback message. For the URB_WRITE_OWORD &
    *        URB_WRITE_HWORD messages, when final channel enable is 1 it
    *        indicates that Vertex 1 DATA [3] will be written to the surface.
    *
    *        0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included
    *        1: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel included
    *
    *     14 Vertex 1 DATA [2] Channel Mask
    *     13 Vertex 1 DATA [1] Channel Mask
    *     12 Vertex 1 DATA [0] Channel Mask
    *     11 Vertex 0 DATA [3] Channel Mask
    *     10 Vertex 0 DATA [2] Channel Mask
    *      9 Vertex 0 DATA [1] Channel Mask
    *      8 Vertex 0 DATA [0] Channel Mask
    *
    * (This is from a section of the PRM that is agnostic to the particular
    * type of shader being executed, so "Vertex 0" and "Vertex 1" refer to
    * geometry shader invocations 0 and 1, respectively). Since we have the
    * enable flags for geometry shader invocation 0 in bits 3:0 of DWORD 0,
    * and the enable flags for geometry shader invocation 1 in bits 7:0 of
    * DWORD 4, we just need to OR them together and store the result in bits
    * 15:8 of DWORD 5.
    *
    * It's easier to get the EU to do this if we think of the src and dst
    * registers as composed of 32 bytes each; then, we want to pick up the
    * contents of bytes 0 and 16 from src, OR them together, and store them in
    * byte 21.
    *
    * We can do that by the following EU instruction:
    *
    *     or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB   { align1 WE_all }
    *
    * Note: this relies on the source register having zeros in (a) bits 7:4 of
    * DWORD 0 and (b) bits 3:0 of DWORD 4. We can rely on (b) because the
    * source register was prepared by GS_OPCODE_PREPARE_CHANNEL_MASKS (which
    * shifts DWORD 4 left by 4 bits), and we can rely on (a) because prior to
    * the execution of GS_OPCODE_PREPARE_CHANNEL_MASKS, DWORDs 0 and 4 need to
    * contain valid channel mask values (which are in the range 0x0-0xf).
    */
   dst = retype(dst, BRW_REGISTER_TYPE_UB);
   src = retype(src, BRW_REGISTER_TYPE_UB);
   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_OR(p, suboffset(vec1(dst), 21), vec1(src), suboffset(vec1(src), 16));
   brw_pop_insn_state(p);
}

void
vec4_generator::generate_oword_dual_block_offsets(struct brw_reg m1,
                                                  struct brw_reg index)
{
   int second_vertex_offset;

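   /* The two vertices' data are an OWord apart. Gen6+ messages take the
    * block offsets in OWord units, while earlier Gens appear to take byte
    * offsets, hence the 16 below.
    */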
   if (brw->gen >= 6)
      second_vertex_offset = 1;
   else
      second_vertex_offset = 16;

   m1 = retype(m1, BRW_REGISTER_TYPE_D);

   /* Set up M1 (message payload). Only the block offsets in M1.0 and
    * M1.4 are used, and the rest are ignored.
    */
   struct brw_reg m1_0 = suboffset(vec1(m1), 0);
   struct brw_reg m1_4 = suboffset(vec1(m1), 4);
   struct brw_reg index_0 = suboffset(vec1(index), 0);
   struct brw_reg index_4 = suboffset(vec1(index), 4);

   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_set_access_mode(p, BRW_ALIGN_1);

   brw_MOV(p, m1_0, index_0);

   if (index.file == BRW_IMMEDIATE_VALUE) {
      index_4.dw1.ud += second_vertex_offset;
      brw_MOV(p, m1_4, index_4);
   } else {
      brw_ADD(p, m1_4, index_4, brw_imm_d(second_vertex_offset));
   }

   brw_pop_insn_state(p);
}

void
vec4_generator::generate_unpack_flags(vec4_instruction *inst,
                                      struct brw_reg dst)
{
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_set_access_mode(p, BRW_ALIGN_1);

   struct brw_reg flags = brw_flag_reg(0, 0);
   struct brw_reg dst_0 = suboffset(vec1(dst), 0);
   struct brw_reg dst_4 = suboffset(vec1(dst), 4);

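   /* Put the low nibble of f0.0 in dst DWORD 0 and the high nibble in dst
    * DWORD 4, so each SIMD4x2 half gets its own 4-bit channel mask.
    * Roughly:
    *
    *     and(1) dst.0<1>UD f0.0 0x0fUD   { align1 WE_all }
    *     and(1) dst.4<1>UD f0.0 0xf0UD   { align1 WE_all }
    *     shr(1) dst.4<1>UD dst.4<0,1,0>UD 4UD   { align1 WE_all }
    */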
   brw_AND(p, dst_0, flags, brw_imm_ud(0x0f));
   brw_AND(p, dst_4, flags, brw_imm_ud(0xf0));
   brw_SHR(p, dst_4, dst_4, brw_imm_ud(4));

   brw_pop_insn_state(p);
}

void
vec4_generator::generate_scratch_read(vec4_instruction *inst,
                                      struct brw_reg dst,
                                      struct brw_reg index)
{
   struct brw_reg header = brw_vec8_grf(0, 0);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   generate_oword_dual_block_offsets(brw_message_reg(inst->base_mrf + 1),
                                     index);

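   /* The message is two registers long: m0 is the header and m1 holds the
    * per-slot OWord offsets set up above; one register of data comes back.
    */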
   uint32_t msg_type;

   if (brw->gen >= 6)
      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else if (brw->gen == 5 || brw->is_g4x)
      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else
      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (brw->gen < 6)
      send->header.destreg__conditionalmod = inst->base_mrf;
   brw_set_dp_read_message(p, send,
                           255, /* binding table index: stateless access */
                           BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                           msg_type,
                           BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
                           2, /* mlen */
                           true, /* header_present */
                           1 /* rlen */);
}

void
vec4_generator::generate_scratch_write(vec4_instruction *inst,
                                       struct brw_reg dst,
                                       struct brw_reg src,
                                       struct brw_reg index)
{
   struct brw_reg header = brw_vec8_grf(0, 0);
   bool write_commit;

   /* If the instruction is predicated, we'll predicate the send, not
    * the header setup.
    */
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   generate_oword_dual_block_offsets(brw_message_reg(inst->base_mrf + 1),
                                     index);

   brw_MOV(p,
           retype(brw_message_reg(inst->base_mrf + 2), BRW_REGISTER_TYPE_D),
           retype(src, BRW_REGISTER_TYPE_D));

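   /* Message layout: m0 = header, m1 = OWord offsets, m2 = the data being
    * written, giving the mlen of 3 below.
    */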
   uint32_t msg_type;

   if (brw->gen >= 7)
      msg_type = GEN7_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
   else if (brw->gen == 6)
      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
   else
      msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;

   brw_set_predicate_control(p, inst->predicate);

   /* Pre-gen6, we have to specify write commits to ensure ordering
    * between reads and writes within a thread. Afterwards, that's
    * guaranteed and write commits only matter for inter-thread
    * synchronization.
    */
   if (brw->gen >= 6) {
      write_commit = false;
   } else {
      /* The visitor set up our destination register to be g0. This
       * means that when the next read comes along, we will end up
       * reading from g0 and causing a block on the write commit. For
       * write-after-read, we are relying on the value of the previous
       * read being used (and thus blocking on completion) before our
       * write is executed. This means we have to be careful in
       * instruction scheduling to not violate this assumption.
       */
      write_commit = true;
   }

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (brw->gen < 6)
      send->header.destreg__conditionalmod = inst->base_mrf;
   brw_set_dp_write_message(p, send,
                            255, /* binding table index: stateless access */
                            BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                            msg_type,
                            3, /* mlen */
                            true, /* header present */
                            false, /* not a render target write */
                            write_commit, /* rlen */
                            false, /* eot */
                            write_commit);
}

void
vec4_generator::generate_pull_constant_load(vec4_instruction *inst,
                                            struct brw_reg dst,
                                            struct brw_reg index,
                                            struct brw_reg offset)
{
   assert(brw->gen <= 7);
   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.dw1.ud;

   struct brw_reg header = brw_vec8_grf(0, 0);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_D),
           offset);

   uint32_t msg_type;

   if (brw->gen >= 6)
      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else if (brw->gen == 5 || brw->is_g4x)
      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else
      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (brw->gen < 6)
      send->header.destreg__conditionalmod = inst->base_mrf;
   brw_set_dp_read_message(p, send,
                           surf_index,
                           BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                           msg_type,
                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
                           2, /* mlen */
                           true, /* header_present */
                           1 /* rlen */);

   mark_surface_used(surf_index);
}

void
vec4_generator::generate_pull_constant_load_gen7(vec4_instruction *inst,
                                                 struct brw_reg dst,
                                                 struct brw_reg surf_index,
                                                 struct brw_reg offset)
{
   assert(surf_index.file == BRW_IMMEDIATE_VALUE &&
          surf_index.type == BRW_REGISTER_TYPE_UD);

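   /* On Gen7 the pull constant load goes through the sampler's LD message
    * rather than a data port read, so no message header is needed.
    */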
   brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, offset);
   brw_set_sampler_message(p, insn,
                           surf_index.dw1.ud,
                           0, /* LD message ignores sampler unit */
                           GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                           1, /* rlen */
                           1, /* mlen */
                           false, /* no header */
                           BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                           0);

   mark_surface_used(surf_index.dw1.ud);
}

/**
 * Generate assembly for a Vec4 IR instruction.
 *
 * \param instruction The Vec4 IR instruction to generate code for.
 * \param dst The destination register.
 * \param src An array of up to three source registers.
 */
void
vec4_generator::generate_vec4_instruction(vec4_instruction *instruction,
                                          struct brw_reg dst,
                                          struct brw_reg *src)
{
   vec4_instruction *inst = (vec4_instruction *) instruction;

   switch (inst->opcode) {
   case BRW_OPCODE_MOV:
      brw_MOV(p, dst, src[0]);
      break;
   case BRW_OPCODE_ADD:
      brw_ADD(p, dst, src[0], src[1]);
      break;
   case BRW_OPCODE_MUL:
      brw_MUL(p, dst, src[0], src[1]);
      break;
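   /* MACH, like ADDC and SUBB further down, relies on the accumulator, so
    * AccWrEn is enabled around it.
    */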
   case BRW_OPCODE_MACH:
      brw_set_acc_write_control(p, 1);
      brw_MACH(p, dst, src[0], src[1]);
      brw_set_acc_write_control(p, 0);
      break;

   case BRW_OPCODE_MAD:
      assert(brw->gen >= 6);
      brw_MAD(p, dst, src[0], src[1], src[2]);
      break;

   case BRW_OPCODE_FRC:
      brw_FRC(p, dst, src[0]);
      break;
   case BRW_OPCODE_RNDD:
      brw_RNDD(p, dst, src[0]);
      break;
   case BRW_OPCODE_RNDE:
      brw_RNDE(p, dst, src[0]);
      break;
   case BRW_OPCODE_RNDZ:
      brw_RNDZ(p, dst, src[0]);
      break;

   case BRW_OPCODE_AND:
      brw_AND(p, dst, src[0], src[1]);
      break;
   case BRW_OPCODE_OR:
      brw_OR(p, dst, src[0], src[1]);
      break;
   case BRW_OPCODE_XOR:
      brw_XOR(p, dst, src[0], src[1]);
      break;
   case BRW_OPCODE_NOT:
      brw_NOT(p, dst, src[0]);
      break;
   case BRW_OPCODE_ASR:
      brw_ASR(p, dst, src[0], src[1]);
      break;
   case BRW_OPCODE_SHR:
      brw_SHR(p, dst, src[0], src[1]);
      break;
   case BRW_OPCODE_SHL:
      brw_SHL(p, dst, src[0], src[1]);
      break;

   case BRW_OPCODE_CMP:
      brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
      break;
   case BRW_OPCODE_SEL:
      brw_SEL(p, dst, src[0], src[1]);
      break;

   case BRW_OPCODE_DPH:
      brw_DPH(p, dst, src[0], src[1]);
      break;

   case BRW_OPCODE_DP4:
      brw_DP4(p, dst, src[0], src[1]);
      break;

   case BRW_OPCODE_DP3:
      brw_DP3(p, dst, src[0], src[1]);
      break;

   case BRW_OPCODE_DP2:
      brw_DP2(p, dst, src[0], src[1]);
      break;

   case BRW_OPCODE_F32TO16:
      assert(brw->gen >= 7);
      brw_F32TO16(p, dst, src[0]);
      break;

   case BRW_OPCODE_F16TO32:
      assert(brw->gen >= 7);
      brw_F16TO32(p, dst, src[0]);
      break;

   case BRW_OPCODE_LRP:
      assert(brw->gen >= 6);
      brw_LRP(p, dst, src[0], src[1], src[2]);
      break;

   case BRW_OPCODE_BFREV:
      assert(brw->gen >= 7);
      /* BFREV only supports UD type for src and dst. */
      brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
                retype(src[0], BRW_REGISTER_TYPE_UD));
      break;
   case BRW_OPCODE_FBH:
      assert(brw->gen >= 7);
      /* FBH only supports UD type for dst. */
      brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
      break;
   case BRW_OPCODE_FBL:
      assert(brw->gen >= 7);
      /* FBL only supports UD type for dst. */
      brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
      break;
   case BRW_OPCODE_CBIT:
      assert(brw->gen >= 7);
      /* CBIT only supports UD type for dst. */
      brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
      break;
   case BRW_OPCODE_ADDC:
      assert(brw->gen >= 7);
      brw_set_acc_write_control(p, 1);
      brw_ADDC(p, dst, src[0], src[1]);
      brw_set_acc_write_control(p, 0);
      break;
   case BRW_OPCODE_SUBB:
      assert(brw->gen >= 7);
      brw_set_acc_write_control(p, 1);
      brw_SUBB(p, dst, src[0], src[1]);
      brw_set_acc_write_control(p, 0);
      break;

   case BRW_OPCODE_BFE:
      assert(brw->gen >= 7);
      brw_BFE(p, dst, src[0], src[1], src[2]);
      break;

   case BRW_OPCODE_BFI1:
      assert(brw->gen >= 7);
      brw_BFI1(p, dst, src[0], src[1]);
      break;
   case BRW_OPCODE_BFI2:
      assert(brw->gen >= 7);
      brw_BFI2(p, dst, src[0], src[1], src[2]);
      break;

   case BRW_OPCODE_IF:
      if (inst->src[0].file != BAD_FILE) {
         /* The instruction has an embedded compare (only allowed on gen6) */
         assert(brw->gen == 6);
         gen6_IF(p, inst->conditional_mod, src[0], src[1]);
      } else {
         struct brw_instruction *brw_inst = brw_IF(p, BRW_EXECUTE_8);
         brw_inst->header.predicate_control = inst->predicate;
      }
      break;

   case BRW_OPCODE_ELSE:
      brw_ELSE(p);
      break;
   case BRW_OPCODE_ENDIF:
      brw_ENDIF(p);
      break;

   case BRW_OPCODE_DO:
      brw_DO(p, BRW_EXECUTE_8);
      break;

   case BRW_OPCODE_BREAK:
      brw_BREAK(p);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      break;
   case BRW_OPCODE_CONTINUE:
      /* FINISHME: We need to write the loop instruction support still. */
      if (brw->gen >= 6)
         gen6_CONT(p);
      else
         brw_CONT(p);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      break;

   case BRW_OPCODE_WHILE:
      brw_WHILE(p);
      break;

   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      if (brw->gen == 6) {
         generate_math1_gen6(inst, dst, src[0]);
      } else {
         /* Also works for Gen7. */
         generate_math1_gen4(inst, dst, src[0]);
      }
      break;

   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (brw->gen >= 7) {
         generate_math2_gen7(inst, dst, src[0], src[1]);
      } else if (brw->gen == 6) {
         generate_math2_gen6(inst, dst, src[0], src[1]);
      } else {
         generate_math2_gen4(inst, dst, src[0], src[1]);
      }
      break;

   case SHADER_OPCODE_TEX:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_MS:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_TG4:
      generate_tex(inst, dst, src[0]);
      break;

   case VS_OPCODE_URB_WRITE:
      generate_vs_urb_write(inst);
      break;

   case VS_OPCODE_SCRATCH_READ:
      generate_scratch_read(inst, dst, src[0]);
      break;

   case VS_OPCODE_SCRATCH_WRITE:
      generate_scratch_write(inst, dst, src[0], src[1]);
      break;

   case VS_OPCODE_PULL_CONSTANT_LOAD:
      generate_pull_constant_load(inst, dst, src[0], src[1]);
      break;

   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
      generate_pull_constant_load_gen7(inst, dst, src[0], src[1]);
      break;

   case GS_OPCODE_URB_WRITE:
      generate_gs_urb_write(inst);
      break;

   case GS_OPCODE_THREAD_END:
      generate_gs_thread_end(inst);
      break;

   case GS_OPCODE_SET_WRITE_OFFSET:
      generate_gs_set_write_offset(dst, src[0], src[1]);
      break;

   case GS_OPCODE_SET_VERTEX_COUNT:
      generate_gs_set_vertex_count(dst, src[0]);
      break;

   case GS_OPCODE_SET_DWORD_2_IMMED:
      generate_gs_set_dword_2_immed(dst, src[0]);
      break;

   case GS_OPCODE_PREPARE_CHANNEL_MASKS:
      generate_gs_prepare_channel_masks(dst);
      break;

   case GS_OPCODE_SET_CHANNEL_MASKS:
      generate_gs_set_channel_masks(dst, src[0]);
      break;

   case SHADER_OPCODE_SHADER_TIME_ADD:
      brw_shader_time_add(p, src[0], SURF_INDEX_VEC4_SHADER_TIME);
      mark_surface_used(SURF_INDEX_VEC4_SHADER_TIME);
      break;

   case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
      generate_unpack_flags(inst, dst);
      break;

   default:
      if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
         _mesa_problem(&brw->ctx, "Unsupported opcode in `%s' in VS\n",
                       opcode_descs[inst->opcode].name);
      } else {
         _mesa_problem(&brw->ctx, "Unsupported opcode %d in VS", inst->opcode);
      }
      abort();
   }
}

void
vec4_generator::generate_code(exec_list *instructions)
{
   int last_native_insn_offset = 0;
   const char *last_annotation_string = NULL;
   const void *last_annotation_ir = NULL;

   if (unlikely(debug_flag)) {
      if (shader) {
         printf("Native code for vertex shader %d:\n", shader_prog->Name);
      } else {
         printf("Native code for vertex program %d:\n", prog->Id);
      }
   }

   foreach_list(node, instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;
      struct brw_reg src[3], dst;

      if (unlikely(debug_flag)) {
         if (last_annotation_ir != inst->ir) {
            last_annotation_ir = inst->ir;
            if (last_annotation_ir) {
               printf(" ");
               if (shader) {
                  ((ir_instruction *) last_annotation_ir)->print();
               } else {
                  const prog_instruction *vpi;
                  vpi = (const prog_instruction *) inst->ir;
                  printf("%d: ", (int)(vpi - prog->Instructions));
                  _mesa_fprint_instruction_opt(stdout, vpi, 0,
                                               PROG_PRINT_DEBUG, NULL);
               }
               printf("\n");
            }
         }
         if (last_annotation_string != inst->annotation) {
            last_annotation_string = inst->annotation;
            if (last_annotation_string)
               printf(" %s\n", last_annotation_string);
         }
      }

      for (unsigned int i = 0; i < 3; i++) {
         src[i] = inst->get_src(this->prog_data, i);
      }
      dst = inst->get_dst();

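      /* Program the default instruction state that the brw_eu emitters will
       * apply to whatever hardware instructions this IR instruction expands
       * to.
       */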
      brw_set_conditionalmod(p, inst->conditional_mod);
      brw_set_predicate_control(p, inst->predicate);
      brw_set_predicate_inverse(p, inst->predicate_inverse);
      brw_set_saturate(p, inst->saturate);
      brw_set_mask_control(p, inst->force_writemask_all);

      unsigned pre_emit_nr_insn = p->nr_insn;

      generate_vec4_instruction(inst, dst, src);

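      /* NoDDClr/NoDDChk tell the EU to skip clearing/checking the
       * destination dependency scoreboard, which is only well-defined when
       * the IR instruction expanded to exactly one hardware instruction, as
       * the assert below verifies.
       */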
      if (inst->no_dd_clear || inst->no_dd_check) {
         assert(p->nr_insn == pre_emit_nr_insn + 1 ||
                !"no_dd_check or no_dd_clear set for IR emitting more "
                 "than 1 instruction");

         struct brw_instruction *last = &p->store[pre_emit_nr_insn];

         if (inst->no_dd_clear)
            last->header.dependency_control |= BRW_DEPENDENCY_NOTCLEARED;
         if (inst->no_dd_check)
            last->header.dependency_control |= BRW_DEPENDENCY_NOTCHECKED;
      }

      if (unlikely(debug_flag)) {
         brw_dump_compile(p, stdout,
                          last_native_insn_offset, p->next_insn_offset);
      }

      last_native_insn_offset = p->next_insn_offset;
   }

   if (unlikely(debug_flag)) {
      printf("\n");
   }

   brw_set_uip_jip(p);

   /* OK, while the INTEL_DEBUG=vs above is very nice for debugging VS
    * emit issues, it doesn't get the jump distances into the output,
    * which is often something we want to debug. So this is here in
    * case you're doing that.
    */
   if (0 && unlikely(debug_flag)) {
      brw_dump_compile(p, stdout, 0, p->next_insn_offset);
   }
}

const unsigned *
vec4_generator::generate_assembly(exec_list *instructions,
                                  unsigned *assembly_size)
{
   brw_set_access_mode(p, BRW_ALIGN_16);
   generate_code(instructions);
   return brw_get_program(p, assembly_size);
}

} /* namespace brw */