457e8003be7545562f3efcda5a98bf30b1f5e71a
[mesa.git] / src / mesa / drivers / dri / i965 / gen8_vec4_generator.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26
27 extern "C" {
28 #include "brw_eu.h"
29 #include "main/macros.h"
30 #include "program/prog_print.h"
31 #include "program/prog_parameter.h"
32 };
33
34 namespace brw {
35
/**
 * Construct a Gen8 vec4 (VS/GS) code generator.
 *
 * All state except prog_data and the debug flag is forwarded to the common
 * gen8_generator base; prog_data supplies binding-table layout and register
 * information during emission, and debug_flag enables assembly annotation
 * and dumping in generate_code().
 */
gen8_vec4_generator::gen8_vec4_generator(struct brw_context *brw,
                                         struct gl_shader_program *shader_prog,
                                         struct gl_program *prog,
                                         struct brw_vec4_prog_data *prog_data,
                                         void *mem_ctx,
                                         bool debug_flag)
   : gen8_generator(brw, shader_prog, prog, mem_ctx),
     prog_data(prog_data),
     debug_flag(debug_flag)
{
}
47
/* Nothing to release here; all allocations are owned by mem_ctx (ralloc). */
gen8_vec4_generator::~gen8_vec4_generator()
{
}
51
/**
 * Emit a SEND to the sampler unit for a vec4 texturing IR instruction.
 *
 * The message payload is assumed to already be assembled in the MRFs
 * starting at ir->base_mrf; this routine selects the sampler message type
 * from the opcode, patches the optional message header, and emits the SEND.
 */
void
gen8_vec4_generator::generate_tex(vec4_instruction *ir, struct brw_reg dst)
{
   int msg_type = 0;

   /* Map the IR opcode (plus shadow-compare state) to a hardware sampler
    * message type.
    */
   switch (ir->opcode) {
   case SHADER_OPCODE_TEX:
   case SHADER_OPCODE_TXL:
      if (ir->shadow_compare) {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
      } else {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
      }
      break;
   case SHADER_OPCODE_TXD:
      if (ir->shadow_compare) {
         msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
      } else {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
      }
      break;
   case SHADER_OPCODE_TXF:
      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
      break;
   case SHADER_OPCODE_TXF_CMS:
      msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
      break;
   case SHADER_OPCODE_TXF_MCS:
      msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
      break;
   case SHADER_OPCODE_TXS:
      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
      break;
   case SHADER_OPCODE_TG4:
      if (ir->shadow_compare) {
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
      } else {
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
      }
      break;
   case SHADER_OPCODE_TG4_OFFSET:
      if (ir->shadow_compare) {
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
      } else {
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
      }
      break;
   default:
      assert(!"should not get here: invalid VS texture opcode");
      break;
   }

   if (ir->header_present) {
      /* Start the message header as a raw copy of g0. */
      MOV_RAW(retype(brw_message_reg(ir->base_mrf), BRW_REGISTER_TYPE_UD),
              retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* Header fix-ups below are scalar writes, so temporarily switch the
       * generator to Align1 mode.
       */
      default_state.access_mode = BRW_ALIGN_1;

      if (ir->texture_offset) {
         /* Set the offset bits in DWord 2. */
         MOV_RAW(retype(brw_vec1_reg(MRF, ir->base_mrf, 2),
                        BRW_REGISTER_TYPE_UD),
                 brw_imm_ud(ir->texture_offset));
      }

      if (ir->sampler >= 16) {
         /* The "Sampler Index" field can only store values between 0 and 15.
          * However, we can add an offset to the "Sampler State Pointer"
          * field, effectively selecting a different set of 16 samplers.
          *
          * The "Sampler State Pointer" needs to be aligned to a 32-byte
          * offset, and each sampler state is only 16-bytes, so we can't
          * exclusively use the offset - we have to use both.
          */
         gen8_instruction *add =
            ADD(get_element_ud(brw_message_reg(ir->base_mrf), 3),
                get_element_ud(brw_vec8_grf(0, 0), 3),
                brw_imm_ud(16 * (ir->sampler / 16) *
                           sizeof(gen7_sampler_state)));
         gen8_set_mask_control(add, BRW_MASK_DISABLE);
      }

      /* Back to the vec4 generator's normal Align16 mode. */
      default_state.access_mode = BRW_ALIGN_16;
   }

   uint32_t surf_index =
      prog_data->base.binding_table.texture_start + ir->sampler;

   gen8_instruction *inst = next_inst(BRW_OPCODE_SEND);
   gen8_set_dst(brw, inst, dst);
   gen8_set_src0(brw, inst, brw_message_reg(ir->base_mrf));
   gen8_set_sampler_message(brw, inst,
                            surf_index,
                            ir->sampler % 16,
                            msg_type,
                            1,                /* rlen */
                            ir->mlen,
                            ir->header_present,
                            BRW_SAMPLER_SIMD_MODE_SIMD4X2);

   brw_mark_surface_used(&prog_data->base, surf_index);
}
154
/**
 * Emit a URB write SEND for VS_OPCODE_URB_WRITE / GS_OPCODE_URB_WRITE.
 *
 * The payload lives in the GRFs that back the MRF region starting at
 * ir->base_mrf (GEN7_MRF_HACK_START remaps MRFs onto GRFs).  For the VS
 * path the g0 header copy is done here; the GS path is expected to have
 * prepared its header already (vs == false).
 */
void
gen8_vec4_generator::generate_urb_write(vec4_instruction *ir, bool vs)
{
   struct brw_reg header = brw_vec8_grf(GEN7_MRF_HACK_START + ir->base_mrf, 0);

   /* Copy g0. */
   if (vs)
      MOV_RAW(header, brw_vec8_grf(0, 0));

   gen8_instruction *inst;
   if (!(ir->urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_OWORD message header */
      default_state.access_mode = BRW_ALIGN_1;
      MOV_RAW(brw_vec1_grf(GEN7_MRF_HACK_START + ir->base_mrf, 5),
              brw_imm_ud(0xff00));
      default_state.access_mode = BRW_ALIGN_16;
   }

   inst = next_inst(BRW_OPCODE_SEND);
   gen8_set_urb_message(brw, inst, ir->urb_write_flags, ir->mlen, 0, ir->offset,
                        true);
   gen8_set_dst(brw, inst, brw_null_reg());
   gen8_set_src0(brw, inst, header);
}
179
180 void
181 gen8_vec4_generator::generate_gs_set_vertex_count(struct brw_reg eot_mrf_header,
182 struct brw_reg src)
183 {
184 /* Move the vertex count into the second MRF for the EOT write. */
185 assert(eot_mrf_header.file == BRW_MESSAGE_REGISTER_FILE);
186 int dst_nr = GEN7_MRF_HACK_START + eot_mrf_header.nr + 1;
187 MOV(retype(brw_vec8_grf(dst_nr, 0), BRW_REGISTER_TYPE_UD), src);
188 }
189
/**
 * Terminate a GS thread with a final URB write carrying EOT.
 *
 * The message header (g0 copy) and the vertex count are assumed to already
 * be in place at ir->base_mrf and ir->base_mrf + 1 respectively (see
 * generate_gs_set_vertex_count).
 */
void
gen8_vec4_generator::generate_gs_thread_end(vec4_instruction *ir)
{
   struct brw_reg src = brw_vec8_grf(GEN7_MRF_HACK_START + ir->base_mrf, 0);
   gen8_instruction *inst;

   /* Enable Channel Masks in the URB_WRITE_HWORD message header */
   default_state.access_mode = BRW_ALIGN_1;
   inst = OR(retype(brw_vec1_grf(GEN7_MRF_HACK_START + ir->base_mrf, 5),
                    BRW_REGISTER_TYPE_UD),
             retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
             brw_imm_ud(0xff00)); /* could be 0x1100 but shouldn't matter */
   gen8_set_mask_control(inst, BRW_MASK_DISABLE);
   default_state.access_mode = BRW_ALIGN_16;

   /* mlen = 2: g0 header + vertex count */
   inst = next_inst(BRW_OPCODE_SEND);
   gen8_set_urb_message(brw, inst, BRW_URB_WRITE_EOT, 2, 0, 0, true);
   gen8_set_dst(brw, inst, brw_null_reg());
   gen8_set_src0(brw, inst, src);
}
211
/**
 * Compute the per-slot URB write offsets (message header DWords 3 and 4)
 * for the two GS invocations by scaling src0's x components by src1.
 */
void
gen8_vec4_generator::generate_gs_set_write_offset(struct brw_reg dst,
                                                  struct brw_reg src0,
                                                  struct brw_reg src1)
{
   /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
    * Header: M0.3):
    *
    *     Slot 0 Offset. This field, after adding to the Global Offset field
    *     in the message descriptor, specifies the offset (in 256-bit units)
    *     from the start of the URB entry, as referenced by URB Handle 0, at
    *     which the data will be accessed.
    *
    * Similar text describes DWORD M0.4, which is slot 1 offset.
    *
    * Therefore, we want to multiply DWORDs 0 and 4 of src0 (the x components
    * of the register for geometry shader invocations 0 and 1) by the
    * immediate value in src1, and store the result in DWORDs 3 and 4 of dst.
    *
    * We can do this with the following EU instruction:
    *
    *     mul(2) dst.3<1>UD src0<8;2,4>UD src1   { Align1 WE_all }
    */
   default_state.access_mode = BRW_ALIGN_1;
   gen8_instruction *inst =
      MUL(suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4), src1);
   gen8_set_mask_control(inst, BRW_MASK_DISABLE);
   default_state.access_mode = BRW_ALIGN_16;
}
241
242 void
243 gen8_vec4_generator::generate_gs_set_dword_2_immed(struct brw_reg dst,
244 struct brw_reg src)
245 {
246 assert(src.file == BRW_IMMEDIATE_VALUE);
247
248 default_state.access_mode = BRW_ALIGN_1;
249
250 gen8_instruction *inst = MOV(suboffset(vec1(dst), 2), src);
251 gen8_set_mask_control(inst, BRW_MASK_DISABLE);
252
253 default_state.access_mode = BRW_ALIGN_16;
254 }
255
256 void
257 gen8_vec4_generator::generate_gs_prepare_channel_masks(struct brw_reg dst)
258 {
259 /* We want to left shift just DWORD 4 (the x component belonging to the
260 * second geometry shader invocation) by 4 bits. So generate the
261 * instruction:
262 *
263 * shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD { align1 WE_all }
264 */
265 dst = suboffset(vec1(dst), 4);
266 default_state.access_mode = BRW_ALIGN_1;
267 gen8_instruction *inst = SHL(dst, dst, brw_imm_ud(4));
268 gen8_set_mask_control(inst, BRW_MASK_DISABLE);
269 default_state.access_mode = BRW_ALIGN_16;
270 }
271
/**
 * OR the two invocations' channel masks together into bits 15:8 of
 * message-header DWord 5 (see the PRM excerpt below for the bit layout).
 */
void
gen8_vec4_generator::generate_gs_set_channel_masks(struct brw_reg dst,
                                                   struct brw_reg src)
{
   /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
    * Header: M0.5):
    *
    *     15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask
    *
    *        When Swizzle Control = URB_INTERLEAVED this bit controls Vertex 1
    *        DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit controls
    *        Vertex 0 DATA[7].  This bit is ANDed with the corresponding
    *        channel enable to determine the final channel enable.  For the
    *        URB_READ_OWORD & URB_READ_HWORD messages, when final channel
    *        enable is 1 it indicates that Vertex 1 DATA [3] will be included
    *        in the writeback message.  For the URB_WRITE_OWORD &
    *        URB_WRITE_HWORD messages, when final channel enable is 1 it
    *        indicates that Vertex 1 DATA [3] will be written to the surface.
    *
    *        0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included
    *        1: Vertex DATA [3] / Vertex 0 DATA[7] channel included
    *
    *     14 Vertex 1 DATA [2] Channel Mask
    *     13 Vertex 1 DATA [1] Channel Mask
    *     12 Vertex 1 DATA [0] Channel Mask
    *     11 Vertex 0 DATA [3] Channel Mask
    *     10 Vertex 0 DATA [2] Channel Mask
    *      9 Vertex 0 DATA [1] Channel Mask
    *      8 Vertex 0 DATA [0] Channel Mask
    *
    * (This is from a section of the PRM that is agnostic to the particular
    * type of shader being executed, so "Vertex 0" and "Vertex 1" refer to
    * geometry shader invocations 0 and 1, respectively).  Since we have the
    * enable flags for geometry shader invocation 0 in bits 3:0 of DWORD 0,
    * and the enable flags for geometry shader invocation 1 in bits 7:0 of
    * DWORD 4, we just need to OR them together and store the result in bits
    * 15:8 of DWORD 5.
    *
    * It's easier to get the EU to do this if we think of the src and dst
    * registers as composed of 32 bytes each; then, we want to pick up the
    * contents of bytes 0 and 16 from src, OR them together, and store them in
    * byte 21.
    *
    * We can do that by the following EU instruction:
    *
    *     or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB    { align1 WE_all }
    *
    * Note: this relies on the source register having zeros in (a) bits 7:4 of
    * DWORD 0 and (b) bits 3:0 of DWORD 4.  We can rely on (b) because the
    * source register was prepared by GS_OPCODE_PREPARE_CHANNEL_MASKS (which
    * shifts DWORD 4 left by 4 bits), and we can rely on (a) because prior to
    * the execution of GS_OPCODE_PREPARE_CHANNEL_MASKS, DWORDs 0 and 4 need to
    * contain valid channel mask values (which are in the range 0x0-0xf).
    */
   dst = retype(dst, BRW_REGISTER_TYPE_UB);
   src = retype(src, BRW_REGISTER_TYPE_UB);

   default_state.access_mode = BRW_ALIGN_1;

   gen8_instruction *inst =
      OR(suboffset(vec1(dst), 21), vec1(src), suboffset(vec1(src), 16));
   gen8_set_mask_control(inst, BRW_MASK_DISABLE);

   default_state.access_mode = BRW_ALIGN_16;
}
337
/**
 * Fill in the two block offsets (M1.0 and M1.4) of an OWord Dual Block
 * scratch message payload.  The second slot's offset is the first plus one
 * OWord-pair (second_vertex_offset).
 */
void
gen8_vec4_generator::generate_oword_dual_block_offsets(struct brw_reg m1,
                                                       struct brw_reg index)
{
   int second_vertex_offset = 1;

   m1 = retype(m1, BRW_REGISTER_TYPE_D);

   /* Set up M1 (message payload).  Only the block offsets in M1.0 and
    * M1.4 are used, and the rest are ignored.
    */
   struct brw_reg m1_0 = suboffset(vec1(m1), 0);
   struct brw_reg m1_4 = suboffset(vec1(m1), 4);
   struct brw_reg index_0 = suboffset(vec1(index), 0);
   struct brw_reg index_4 = suboffset(vec1(index), 4);

   /* Scalar payload writes: unmasked, Align1. */
   default_state.mask_control = BRW_MASK_DISABLE;
   default_state.access_mode = BRW_ALIGN_1;

   MOV(m1_0, index_0);

   if (index.file == BRW_IMMEDIATE_VALUE) {
      /* Fold the offset directly into the immediate instead of emitting
       * an ADD.
       */
      index_4.dw1.ud += second_vertex_offset;
      MOV(m1_4, index_4);
   } else {
      ADD(m1_4, index_4, brw_imm_d(second_vertex_offset));
   }

   default_state.mask_control = BRW_MASK_ENABLE;
   default_state.access_mode = BRW_ALIGN_16;
}
369
/**
 * Read one vec4 back from scratch space (register spilling) via an OWord
 * Dual Block read through the data cache.  `index` is the scratch block
 * offset (immediate or register).
 */
void
gen8_vec4_generator::generate_scratch_read(vec4_instruction *ir,
                                           struct brw_reg dst,
                                           struct brw_reg index)
{
   struct brw_reg header = brw_vec8_grf(GEN7_MRF_HACK_START + ir->base_mrf, 0);

   /* Message header: raw copy of g0. */
   MOV_RAW(header, brw_vec8_grf(0, 0));

   generate_oword_dual_block_offsets(brw_message_reg(ir->base_mrf + 1), index);

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   gen8_instruction *send = next_inst(BRW_OPCODE_SEND);
   gen8_set_dst(brw, send, dst);
   gen8_set_src0(brw, send, header);
   gen8_set_dp_message(brw, send, GEN7_SFID_DATAPORT_DATA_CACHE,
                       255, /* binding table index: stateless access */
                       GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ,
                       BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                       2,      /* mlen */
                       1,      /* rlen */
                       true,   /* header present */
                       false); /* EOT */
}
396
/**
 * Write one vec4 to scratch space (register spilling) via an OWord Dual
 * Block write through the data cache.  Mirrors generate_scratch_read(),
 * with the spilled data appended as a third message register and the
 * SEND optionally predicated.
 */
void
gen8_vec4_generator::generate_scratch_write(vec4_instruction *ir,
                                            struct brw_reg dst,
                                            struct brw_reg src,
                                            struct brw_reg index)
{
   struct brw_reg header = brw_vec8_grf(GEN7_MRF_HACK_START + ir->base_mrf, 0);

   /* Message header: raw copy of g0. */
   MOV_RAW(header, brw_vec8_grf(0, 0));

   generate_oword_dual_block_offsets(brw_message_reg(ir->base_mrf + 1), index);

   /* Data payload in m(base_mrf + 2). */
   MOV(retype(brw_message_reg(ir->base_mrf + 2), BRW_REGISTER_TYPE_D),
       retype(src, BRW_REGISTER_TYPE_D));

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   gen8_instruction *send = next_inst(BRW_OPCODE_SEND);
   gen8_set_dst(brw, send, dst);
   gen8_set_src0(brw, send, header);
   gen8_set_pred_control(send, ir->predicate);
   gen8_set_dp_message(brw, send, GEN7_SFID_DATAPORT_DATA_CACHE,
                       255, /* binding table index: stateless access */
                       GEN7_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE,
                       BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                       3,      /* mlen */
                       0,      /* rlen */
                       true,   /* header present */
                       false); /* EOT */
}
428
/**
 * Load a vec4 of uniform data from a constant buffer using the sampler
 * cache's LD message.  `index` must be an immediate surface (binding-table)
 * index; `offset` is a GRF holding the element offset.
 */
void
gen8_vec4_generator::generate_pull_constant_load(vec4_instruction *inst,
                                                 struct brw_reg dst,
                                                 struct brw_reg index,
                                                 struct brw_reg offset)
{
   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.dw1.ud;

   assert(offset.file == BRW_GENERAL_REGISTER_FILE);

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   gen8_instruction *send = next_inst(BRW_OPCODE_SEND);
   gen8_set_dst(brw, send, dst);
   gen8_set_src0(brw, send, offset);
   gen8_set_sampler_message(brw, send,
                            surf_index,
                            0, /* The LD message ignores the sampler unit. */
                            GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                            1,      /* rlen */
                            1,      /* mlen */
                            false,  /* no header */
                            BRW_SAMPLER_SIMD_MODE_SIMD4X2);

   brw_mark_surface_used(&prog_data->base, surf_index);
}
458
/**
 * Emit an untyped atomic operation (SIMD4x2) through data cache port 1.
 *
 * `atomic_op` is an immediate BRW_AOP_* code and `surf_index` an immediate
 * binding-table index; the message payload is pre-built at ir->base_mrf.
 * The returned (pre-op) value is written to dst.
 */
void
gen8_vec4_generator::generate_untyped_atomic(vec4_instruction *ir,
                                             struct brw_reg dst,
                                             struct brw_reg atomic_op,
                                             struct brw_reg surf_index)
{
   assert(atomic_op.file == BRW_IMMEDIATE_VALUE &&
          atomic_op.type == BRW_REGISTER_TYPE_UD &&
          surf_index.file == BRW_IMMEDIATE_VALUE &&
          surf_index.type == BRW_REGISTER_TYPE_UD);
   /* The atomic op code must fit in the 4-bit message-control field. */
   assert((atomic_op.dw1.ud & ~0xf) == 0);

   unsigned msg_control =
      atomic_op.dw1.ud |  /* Atomic Operation Type: BRW_AOP_* */
      (1 << 5);           /* Return data expected */

   gen8_instruction *inst = next_inst(BRW_OPCODE_SEND);
   gen8_set_dst(brw, inst, retype(dst, BRW_REGISTER_TYPE_UD));
   gen8_set_src0(brw, inst, brw_message_reg(ir->base_mrf));
   gen8_set_dp_message(brw, inst, HSW_SFID_DATAPORT_DATA_CACHE_1,
                       surf_index.dw1.ud,
                       HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2,
                       msg_control,
                       ir->mlen,
                       1,    /* rlen */
                       ir->header_present,
                       false);

   brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
}
489
490
491
/**
 * Emit an untyped surface read through data cache port 1.
 *
 * Only the R channel is enabled in the message control, so a single dword
 * per slot is returned into dst.  `surf_index` must be an immediate
 * binding-table index.
 */
void
gen8_vec4_generator::generate_untyped_surface_read(vec4_instruction *ir,
                                                   struct brw_reg dst,
                                                   struct brw_reg surf_index)
{
   assert(surf_index.file == BRW_IMMEDIATE_VALUE &&
          surf_index.type == BRW_REGISTER_TYPE_UD);

   gen8_instruction *inst = next_inst(BRW_OPCODE_SEND);
   gen8_set_dst(brw, inst, retype(dst, BRW_REGISTER_TYPE_UD));
   gen8_set_src0(brw, inst, brw_message_reg(ir->base_mrf));
   gen8_set_dp_message(brw, inst, HSW_SFID_DATAPORT_DATA_CACHE_1,
                       surf_index.dw1.ud,
                       HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ,
                       0xe, /* enable only the R channel */
                       ir->mlen,
                       1,   /* rlen */
                       ir->header_present,
                       false);

   brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
}
514
515
516 void
517 gen8_vec4_generator::generate_vec4_instruction(vec4_instruction *instruction,
518 struct brw_reg dst,
519 struct brw_reg *src)
520 {
521 vec4_instruction *ir = (vec4_instruction *) instruction;
522
523 if (dst.width == BRW_WIDTH_4) {
524 /* This happens in attribute fixups for "dual instanced" geometry
525 * shaders, since they use attributes that are vec4's. Since the exec
526 * width is only 4, it's essential that the caller set
527 * force_writemask_all in order to make sure the instruction is executed
528 * regardless of which channels are enabled.
529 */
530 assert(ir->force_writemask_all);
531
532 /* Fix up any <8;8,1> or <0;4,1> source registers to <4;4,1> to satisfy
533 * the following register region restrictions (from Graphics BSpec:
534 * 3D-Media-GPGPU Engine > EU Overview > Registers and Register Regions
535 * > Register Region Restrictions)
536 *
537 * 1. ExecSize must be greater than or equal to Width.
538 *
539 * 2. If ExecSize = Width and HorzStride != 0, VertStride must be set
540 * to Width * HorzStride."
541 */
542 for (int i = 0; i < 3; i++) {
543 if (src[i].file == BRW_GENERAL_REGISTER_FILE)
544 src[i] = stride(src[i], 4, 4, 1);
545 }
546 }
547
548 switch (ir->opcode) {
549 case BRW_OPCODE_MOV:
550 MOV(dst, src[0]);
551 break;
552
553 case BRW_OPCODE_ADD:
554 ADD(dst, src[0], src[1]);
555 break;
556
557 case BRW_OPCODE_MUL:
558 MUL(dst, src[0], src[1]);
559 break;
560
561 case BRW_OPCODE_MACH:
562 MACH(dst, src[0], src[1]);
563 break;
564
565 case BRW_OPCODE_MAD:
566 MAD(dst, src[0], src[1], src[2]);
567 break;
568
569 case BRW_OPCODE_FRC:
570 FRC(dst, src[0]);
571 break;
572
573 case BRW_OPCODE_RNDD:
574 RNDD(dst, src[0]);
575 break;
576
577 case BRW_OPCODE_RNDE:
578 RNDE(dst, src[0]);
579 break;
580
581 case BRW_OPCODE_RNDZ:
582 RNDZ(dst, src[0]);
583 break;
584
585 case BRW_OPCODE_AND:
586 AND(dst, src[0], src[1]);
587 break;
588
589 case BRW_OPCODE_OR:
590 OR(dst, src[0], src[1]);
591 break;
592
593 case BRW_OPCODE_XOR:
594 XOR(dst, src[0], src[1]);
595 break;
596
597 case BRW_OPCODE_NOT:
598 NOT(dst, src[0]);
599 break;
600
601 case BRW_OPCODE_ASR:
602 ASR(dst, src[0], src[1]);
603 break;
604
605 case BRW_OPCODE_SHR:
606 SHR(dst, src[0], src[1]);
607 break;
608
609 case BRW_OPCODE_SHL:
610 SHL(dst, src[0], src[1]);
611 break;
612
613 case BRW_OPCODE_CMP:
614 CMP(dst, ir->conditional_mod, src[0], src[1]);
615 break;
616
617 case BRW_OPCODE_SEL:
618 SEL(dst, src[0], src[1]);
619 break;
620
621 case BRW_OPCODE_DPH:
622 DPH(dst, src[0], src[1]);
623 break;
624
625 case BRW_OPCODE_DP4:
626 DP4(dst, src[0], src[1]);
627 break;
628
629 case BRW_OPCODE_DP3:
630 DP3(dst, src[0], src[1]);
631 break;
632
633 case BRW_OPCODE_DP2:
634 DP2(dst, src[0], src[1]);
635 break;
636
637 case BRW_OPCODE_F32TO16:
638 /* Emulate the Gen7 zeroing bug. */
639 MOV(retype(dst, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
640 MOV(retype(dst, BRW_REGISTER_TYPE_HF), src[0]);
641 break;
642
643 case BRW_OPCODE_F16TO32:
644 MOV(dst, retype(src[0], BRW_REGISTER_TYPE_HF));
645 break;
646
647 case BRW_OPCODE_LRP:
648 LRP(dst, src[0], src[1], src[2]);
649 break;
650
651 case BRW_OPCODE_BFREV:
652 /* BFREV only supports UD type for src and dst. */
653 BFREV(retype(dst, BRW_REGISTER_TYPE_UD),
654 retype(src[0], BRW_REGISTER_TYPE_UD));
655 break;
656
657 case BRW_OPCODE_FBH:
658 /* FBH only supports UD type for dst. */
659 FBH(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
660 break;
661
662 case BRW_OPCODE_FBL:
663 /* FBL only supports UD type for dst. */
664 FBL(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
665 break;
666
667 case BRW_OPCODE_CBIT:
668 /* CBIT only supports UD type for dst. */
669 CBIT(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
670 break;
671
672 case BRW_OPCODE_ADDC:
673 ADDC(dst, src[0], src[1]);
674 break;
675
676 case BRW_OPCODE_SUBB:
677 SUBB(dst, src[0], src[1]);
678 break;
679
680 case BRW_OPCODE_BFE:
681 BFE(dst, src[0], src[1], src[2]);
682 break;
683
684 case BRW_OPCODE_BFI1:
685 BFI1(dst, src[0], src[1]);
686 break;
687
688 case BRW_OPCODE_BFI2:
689 BFI2(dst, src[0], src[1], src[2]);
690 break;
691
692 case BRW_OPCODE_IF:
693 IF(ir->predicate);
694 break;
695
696 case BRW_OPCODE_ELSE:
697 ELSE();
698 break;
699
700 case BRW_OPCODE_ENDIF:
701 ENDIF();
702 break;
703
704 case BRW_OPCODE_DO:
705 DO();
706 break;
707
708 case BRW_OPCODE_BREAK:
709 BREAK();
710 break;
711
712 case BRW_OPCODE_CONTINUE:
713 CONTINUE();
714 break;
715
716 case BRW_OPCODE_WHILE:
717 WHILE();
718 break;
719
720 case SHADER_OPCODE_RCP:
721 MATH(BRW_MATH_FUNCTION_INV, dst, src[0]);
722 break;
723
724 case SHADER_OPCODE_RSQ:
725 MATH(BRW_MATH_FUNCTION_RSQ, dst, src[0]);
726 break;
727
728 case SHADER_OPCODE_SQRT:
729 MATH(BRW_MATH_FUNCTION_SQRT, dst, src[0]);
730 break;
731
732 case SHADER_OPCODE_EXP2:
733 MATH(BRW_MATH_FUNCTION_EXP, dst, src[0]);
734 break;
735
736 case SHADER_OPCODE_LOG2:
737 MATH(BRW_MATH_FUNCTION_LOG, dst, src[0]);
738 break;
739
740 case SHADER_OPCODE_SIN:
741 MATH(BRW_MATH_FUNCTION_SIN, dst, src[0]);
742 break;
743
744 case SHADER_OPCODE_COS:
745 MATH(BRW_MATH_FUNCTION_COS, dst, src[0]);
746 break;
747
748 case SHADER_OPCODE_POW:
749 MATH(BRW_MATH_FUNCTION_POW, dst, src[0], src[1]);
750 break;
751
752 case SHADER_OPCODE_INT_QUOTIENT:
753 MATH(BRW_MATH_FUNCTION_INT_DIV_QUOTIENT, dst, src[0], src[1]);
754 break;
755
756 case SHADER_OPCODE_INT_REMAINDER:
757 MATH(BRW_MATH_FUNCTION_INT_DIV_REMAINDER, dst, src[0], src[1]);
758 break;
759
760 case SHADER_OPCODE_TEX:
761 case SHADER_OPCODE_TXD:
762 case SHADER_OPCODE_TXF:
763 case SHADER_OPCODE_TXF_CMS:
764 case SHADER_OPCODE_TXF_MCS:
765 case SHADER_OPCODE_TXL:
766 case SHADER_OPCODE_TXS:
767 case SHADER_OPCODE_TG4:
768 case SHADER_OPCODE_TG4_OFFSET:
769 generate_tex(ir, dst);
770 break;
771
772 case VS_OPCODE_URB_WRITE:
773 generate_urb_write(ir, true);
774 break;
775
776 case SHADER_OPCODE_GEN4_SCRATCH_READ:
777 generate_scratch_read(ir, dst, src[0]);
778 break;
779
780 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
781 generate_scratch_write(ir, dst, src[0], src[1]);
782 break;
783
784 case VS_OPCODE_PULL_CONSTANT_LOAD:
785 case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
786 generate_pull_constant_load(ir, dst, src[0], src[1]);
787 break;
788
789 case GS_OPCODE_URB_WRITE:
790 generate_urb_write(ir, false);
791 break;
792
793 case GS_OPCODE_THREAD_END:
794 generate_gs_thread_end(ir);
795 break;
796
797 case GS_OPCODE_SET_WRITE_OFFSET:
798 generate_gs_set_write_offset(dst, src[0], src[1]);
799 break;
800
801 case GS_OPCODE_SET_VERTEX_COUNT:
802 generate_gs_set_vertex_count(dst, src[0]);
803 break;
804
805 case GS_OPCODE_SET_DWORD_2_IMMED:
806 generate_gs_set_dword_2_immed(dst, src[0]);
807 break;
808
809 case GS_OPCODE_PREPARE_CHANNEL_MASKS:
810 generate_gs_prepare_channel_masks(dst);
811 break;
812
813 case GS_OPCODE_SET_CHANNEL_MASKS:
814 generate_gs_set_channel_masks(dst, src[0]);
815 break;
816
817 case SHADER_OPCODE_SHADER_TIME_ADD:
818 assert(!"XXX: Missing Gen8 vec4 support for INTEL_DEBUG=shader_time");
819 break;
820
821 case SHADER_OPCODE_UNTYPED_ATOMIC:
822 generate_untyped_atomic(ir, dst, src[0], src[1]);
823 break;
824
825 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
826 generate_untyped_surface_read(ir, dst, src[0]);
827 break;
828
829 case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
830 assert(!"VS_OPCODE_UNPACK_FLAGS_SIMD4X2 should not be used on Gen8+.");
831 break;
832
833 default:
834 if (ir->opcode < (int) ARRAY_SIZE(opcode_descs)) {
835 _mesa_problem(ctx, "Unsupported opcode in `%s' in VS\n",
836 opcode_descs[ir->opcode].name);
837 } else {
838 _mesa_problem(ctx, "Unsupported opcode %d in VS", ir->opcode);
839 }
840 abort();
841 }
842 }
843
/**
 * Walk the IR list and emit machine code for each instruction.
 *
 * When debug_flag is set, a CFG and per-IR annotations are built so the
 * resulting assembly can be dumped grouped by basic block and annotated
 * with the originating IR.
 */
void
gen8_vec4_generator::generate_code(exec_list *instructions)
{
   struct annotation_info annotation;
   memset(&annotation, 0, sizeof(annotation));

   cfg_t *cfg = NULL;
   if (unlikely(debug_flag))
      cfg = new(mem_ctx) cfg_t(instructions);

   foreach_in_list(vec4_instruction, ir, instructions) {
      struct brw_reg src[3], dst;

      if (unlikely(debug_flag))
         annotate(brw, &annotation, cfg, ir, next_inst_offset);

      for (unsigned int i = 0; i < 3; i++) {
         src[i] = ir->get_src(prog_data, i);
      }
      dst = ir->get_dst();

      /* Latch the per-IR instruction state so the emission helpers apply it
       * to every EU instruction they generate for this IR.
       */
      default_state.conditional_mod = ir->conditional_mod;
      default_state.predicate = ir->predicate;
      default_state.predicate_inverse = ir->predicate_inverse;
      default_state.saturate = ir->saturate;

      const unsigned pre_emit_nr_inst = nr_inst;

      generate_vec4_instruction(ir, dst, src);

      if (ir->no_dd_clear || ir->no_dd_check) {
         /* Dependency-control hints apply to a single EU instruction, so the
          * IR must have expanded to exactly one.
          */
         assert(nr_inst == pre_emit_nr_inst + 1 ||
                !"no_dd_check or no_dd_clear set for IR emitting more "
                 "than 1 instruction");

         gen8_instruction *last = &store[pre_emit_nr_inst];
         gen8_set_no_dd_clear(last, ir->no_dd_clear);
         gen8_set_no_dd_check(last, ir->no_dd_check);
      }
   }

   patch_jump_targets();
   annotation_finalize(&annotation, next_inst_offset);

   int before_size = next_inst_offset;

   if (unlikely(debug_flag)) {
      if (shader_prog) {
         fprintf(stderr, "Native code for %s vertex shader %d:\n",
                 shader_prog->Label ? shader_prog->Label : "unnamed",
                 shader_prog->Name);
      } else {
         fprintf(stderr, "Native code for vertex program %d:\n", prog->Id);
      }
      /* Each Gen8 instruction is 16 bytes. */
      fprintf(stderr, "vec4 shader: %d instructions.\n", before_size / 16);

      dump_assembly(store, annotation.ann_count, annotation.ann, brw, prog);
      ralloc_free(annotation.ann);
   }
}
904
905 const unsigned *
906 gen8_vec4_generator::generate_assembly(exec_list *instructions,
907 unsigned *assembly_size)
908 {
909 default_state.access_mode = BRW_ALIGN_16;
910 default_state.exec_size = BRW_EXECUTE_8;
911 generate_code(instructions);
912
913 *assembly_size = next_inst_offset;
914 return (const unsigned *) store;
915 }
916
917 } /* namespace brw */