/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_generator.cpp
 *
 * This file supports generating code from the FS LIR to the actual
 * native instructions.
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_program.h"
static enum brw_reg_file
brw_file_from_reg(fs_reg *reg)
{
   switch (reg->file) {
   case ARF:
      return BRW_ARCHITECTURE_REGISTER_FILE;
   case FIXED_GRF:
   case VGRF:
      return BRW_GENERAL_REGISTER_FILE;
   case MRF:
      return BRW_MESSAGE_REGISTER_FILE;
   case IMM:
      return BRW_IMMEDIATE_VALUE;
   case BAD_FILE:
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }
   return BRW_ARCHITECTURE_REGISTER_FILE;
}

static struct brw_reg
brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen, bool compressed)
{
   struct brw_reg brw_reg;

   switch (reg->file) {
   case MRF:
      assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(gen));
      /* Fallthrough */
   case VGRF:
      if (reg->stride == 0) {
         brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
      } else {
         /* From the Haswell PRM:
          *
          *  "VertStride must be used to cross GRF register boundaries. This
          *   rule implies that elements within a 'Width' cannot cross GRF
          *   boundaries."
          *
          * The maximum width value that could satisfy this restriction is:
          */
         const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));
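
         /* Worked example (illustrative): a float source (type_sz == 4)
          * with stride 2 gives reg_width = 32 / (2 * 4) = 4, i.e. at most
          * four elements fit in a 32-byte GRF at that stride.
          */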

         /* Because the hardware can only split source regions at a whole
          * multiple of width during decompression (i.e. vertically), clamp
          * the value obtained above to the physical execution size of a
          * single decompressed chunk of the instruction:
          */
         const unsigned phys_width = compressed ? inst->exec_size / 2 :
                                     inst->exec_size;

         /* XXX - The equation above is strictly speaking not correct on
          *       hardware that supports unbalanced GRF writes -- On Gen9+
          *       each decompressed chunk of the instruction may have a
          *       different execution size when the number of components
          *       written to each destination GRF is not the same.
          */
         const unsigned width = MIN2(reg_width, phys_width);
         brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
         brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
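
         /* The result is a standard <V;W,H> region.  E.g. (illustrative)
          * width == 4 with stride == 2 yields <8;4,2>: rows of four
          * elements two units apart, with rows starting eight units apart.
          */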
      }

      brw_reg = retype(brw_reg, reg->type);
      brw_reg = byte_offset(brw_reg, reg->subreg_offset);
      brw_reg.abs = reg->abs;
      brw_reg.negate = reg->negate;
      break;
   case ARF:
   case FIXED_GRF:
   case IMM:
      brw_reg = reg->as_brw_reg();
      break;
   case BAD_FILE:
      /* Probably unused. */
      brw_reg = brw_null_reg();
      break;
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }

   return brw_reg;
}

fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
                           void *mem_ctx,
                           const void *key,
                           struct brw_stage_prog_data *prog_data,
                           unsigned promoted_constants,
                           bool runtime_check_aads_emit,
                           gl_shader_stage stage)

   : compiler(compiler), log_data(log_data),
     devinfo(compiler->devinfo), key(key),
     prog_data(prog_data),
     promoted_constants(promoted_constants),
     runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
     stage(stage), mem_ctx(mem_ctx)
{
   p = rzalloc(mem_ctx, struct brw_codegen);
   brw_init_codegen(devinfo, p, mem_ctx);
}

fs_generator::~fs_generator()
{
}

class ip_record : public exec_node {
public:
   DECLARE_RALLOC_CXX_OPERATORS(ip_record)

   ip_record(int ip)
   {
      this->ip = ip;
   }

   int ip;
};

bool
fs_generator::patch_discard_jumps_to_fb_writes()
{
   if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
      return false;

   int scale = brw_jump_scale(p->devinfo);

   /* There is a somewhat strange undocumented requirement of using
    * HALT, according to the simulator.  If some channel has HALTed to
    * a particular UIP, then by the end of the program, every channel
    * must have HALTed to that UIP.  Furthermore, the tracking is a
    * stack, so you can't do the final halt of a UIP after starting
    * halting to a new UIP.
    *
    * Symptoms of not emitting this instruction on actual hardware
    * included GPU hangs and sparkly rendering on the piglit discard
    * tests.
    */
   brw_inst *last_halt = gen6_HALT(p);
   brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
   brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
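
   /* A note on units (assumption, following brw_jump_scale()): jump
    * distances are counted in 64-bit chunks before Gen8 (a full 128-bit
    * instruction is 2 units) and in bytes on Gen8+ (16 units per
    * instruction), which is why every distance here is multiplied by
    * scale.
    */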

   int ip = p->nr_insn;

   foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
      brw_inst *patch = &p->store[patch_ip->ip];

      assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
      /* HALT takes a half-instruction distance from the pre-incremented IP. */
      brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
   }

   this->discard_halt_patches.make_empty();
   return true;
}

void
fs_generator::fire_fb_write(fs_inst *inst,
                            struct brw_reg payload,
                            struct brw_reg implied_header,
                            GLuint nr)
{
   uint32_t msg_control;

   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;

   if (devinfo->gen < 6) {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0));
      brw_pop_insn_state(p);
   }

   if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
   else if (prog_data->dual_src_blend) {
      if (!inst->force_sechalf)
         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
      else
         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
   } else if (inst->exec_size == 16)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   else
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;

   uint32_t surf_index =
      prog_data->binding_table.render_target_start + inst->target;

   bool last_render_target = inst->eot ||
                             (prog_data->dual_src_blend && dispatch_width == 16);

   brw_fb_WRITE(p,
                payload,
                implied_header,
                msg_control,
                surf_index,
                nr,
                0,
                inst->eot,
                last_render_target,
                inst->header_size != 0);

   brw_mark_surface_used(&prog_data->base, surf_index);
}

void
fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
{
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key;
   struct brw_reg implied_header;

   if (devinfo->gen < 8 && !devinfo->is_haswell) {
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   }

   if (inst->base_mrf >= 0)
      payload = brw_message_reg(inst->base_mrf);

   /* The header is two registers: g0 and g1 hold the contents.  g0 is
    * handled by the implied move; g1 is set up here.
    */
   if (inst->header_size != 0) {
      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_flag_reg(p, 0, 0);

      /* On HSW, the GPU will use the predicate on SENDC, unless the header is
       * present.
       */
      if (prog_data->uses_kill) {
         struct brw_reg pixel_mask;

         if (devinfo->gen >= 6)
            pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
         else
            pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

         brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
      }

      if (devinfo->gen >= 6) {
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_16);
         brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         brw_MOV(p,
                 retype(payload, BRW_REGISTER_TYPE_UD),
                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
         brw_pop_insn_state(p);

         if (inst->target > 0 && key->replicate_alpha) {
            /* Set "Source0 Alpha Present to RenderTarget" bit in message
             * header.
             */
            brw_OR(p,
                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                   brw_imm_ud(0x1 << 11));
         }

         if (inst->target > 0) {
            /* Set the render target index for choosing BLEND_STATE. */
            brw_MOV(p, retype(vec1(suboffset(payload, 2)),
                              BRW_REGISTER_TYPE_UD),
                    brw_imm_ud(inst->target));
         }

         /* Set computes stencil to render target */
         if (prog_data->computed_stencil) {
            brw_OR(p,
                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                   brw_imm_ud(0x1 << 14));
         }

         implied_header = brw_null_reg();
      } else {
         implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      }

      brw_pop_insn_state(p);
   } else {
      implied_header = brw_null_reg();
   }

   if (!runtime_check_aads_emit) {
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   } else {
      /* This can only happen in gen < 6 */
      assert(devinfo->gen < 6);

      struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));

      /* Check runtime bit to detect if we have to send AA data or not */
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_AND(p,
              v1_null_ud,
              retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(1<<26));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);

      int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
      brw_inst_set_exec_size(p->devinfo, brw_last_inst, BRW_EXECUTE_1);
      {
         /* Don't send AA data */
         fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
      }
      brw_land_fwd_jump(p, jmp);
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   }
}

void
fs_generator::generate_mov_indirect(fs_inst *inst,
                                    struct brw_reg dst,
                                    struct brw_reg reg,
                                    struct brw_reg indirect_byte_offset)
{
   assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
   assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);

   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;

   if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
      imm_byte_offset += indirect_byte_offset.ud;

      reg.nr = imm_byte_offset / REG_SIZE;
      reg.subnr = imm_byte_offset % REG_SIZE;
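
      /* Illustrative arithmetic: reg = g4.8 plus a constant offset of 40
       * gives imm_byte_offset = 4 * 32 + 8 + 40 = 176, which maps back to
       * 176 / 32 = g5 and 176 % 32 = subnr 16.
       */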
      brw_MOV(p, dst, reg);
   } else {
      /* Prior to Broadwell, there are only 8 address registers. */
      assert(inst->exec_size == 8 || devinfo->gen >= 8);

      /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
      struct brw_reg addr = vec8(brw_address_reg(0));

      /* The destination stride of an instruction (in bytes) must be greater
       * than or equal to the size of the rest of the instruction.  Since the
       * address register is of type UW, we can't use a D-type instruction.
       * In order to get around this, we retype to UW and use a stride.
       */
      indirect_byte_offset =
         retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);
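      /* What the retype achieves (illustrative): UD -> UW with a doubled
       * horizontal stride makes each UW element land on the low word of the
       * corresponding dword, so the address math below still sees one
       * 16-bit byte offset per channel.
       */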

      struct brw_reg ind_src;
      if (devinfo->gen < 8) {
         /* From the Haswell PRM section "Register Region Restrictions":
          *
          *    "The lower bits of the AddressImmediate must not overflow to
          *    change the register address.  The lower 5 bits of Address
          *    Immediate when added to lower 5 bits of address register gives
          *    the sub-register offset.  The upper bits of Address Immediate
          *    when added to upper bits of address register gives the register
          *    address.  Any overflow from sub-register offset is dropped."
          *
          * This restriction is only listed in the Haswell PRM but empirical
          * testing indicates that it applies on all older generations and is
          * lifted on Broadwell.
          *
          * Since the indirect may cause us to cross a register boundary, this
          * makes the base offset almost useless.  We could try to do
          * something clever where we use an actual base offset if
          * base_offset % 32 == 0 but that would mean we were generating
          * different code depending on the base offset.  Instead, for the
          * sake of consistency, we'll just do the add ourselves.
          */
         brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
         ind_src = brw_VxH_indirect(0, 0);
      } else {
         brw_MOV(p, addr, indirect_byte_offset);
         ind_src = brw_VxH_indirect(0, imm_byte_offset);
      }

      brw_inst *mov = brw_MOV(p, dst, retype(ind_src, dst.type));

      if (devinfo->gen == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
          !inst->get_next()->is_tail_sentinel() &&
          ((fs_inst *)inst->get_next())->mlen > 0) {
         /* From the Sandybridge PRM:
          *
          *    "[Errata: DevSNB(SNB)] If MRF register is updated by any
          *    instruction that “indexed/indirect” source AND is followed by a
          *    send, the instruction requires a “Switch”.  This is to avoid
          *    race condition where send may dispatch before MRF is updated."
          */
         brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
      }
   }
}

void
fs_generator::generate_urb_read(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg header)
{
   assert(header.file == BRW_GENERAL_REGISTER_FILE);
   assert(header.type == BRW_REGISTER_TYPE_UD);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   brw_set_src1(p, send, brw_imm_ud(0u));

   brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB);
   brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ);

   if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT)
      brw_inst_set_urb_per_slot_offset(p->devinfo, send, true);

   brw_inst_set_mlen(p->devinfo, send, inst->mlen);
   brw_inst_set_rlen(p->devinfo, send, inst->regs_written);
   brw_inst_set_header_present(p->devinfo, send, true);
   brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset);
}

void
fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
{
   brw_inst *insn;

   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, brw_null_reg());
   brw_set_src0(p, insn, payload);
   brw_set_src1(p, insn, brw_imm_d(0));

   brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB);
   brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE);

   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
      brw_inst_set_urb_per_slot_offset(p->devinfo, insn, true);

   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
      brw_inst_set_urb_channel_mask_present(p->devinfo, insn, true);

   brw_inst_set_mlen(p->devinfo, insn, inst->mlen);
   brw_inst_set_rlen(p->devinfo, insn, 0);
   brw_inst_set_eot(p->devinfo, insn, inst->eot);
   brw_inst_set_header_present(p->devinfo, insn, true);
   brw_inst_set_urb_global_offset(p->devinfo, insn, inst->offset);
}

void
fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
{
   struct brw_inst *insn;

   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, insn, payload);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Terminate a compute shader by sending a message to the thread spawner.
    */
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER);
   brw_inst_set_mlen(devinfo, insn, 1);
   brw_inst_set_rlen(devinfo, insn, 0);
   brw_inst_set_eot(devinfo, insn, inst->eot);
   brw_inst_set_header_present(devinfo, insn, false);

   brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */
   brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */

   /* Note that even though the thread has a URB resource associated with it,
    * we set the "do not dereference URB" bit, because the URB resource is
    * managed by the fixed-function unit, so it will free it automatically.
    */
   brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */

   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}

void
fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src)
{
   brw_barrier(p, src);
   brw_WAIT(p);
}

void
fs_generator::generate_linterp(fs_inst *inst,
                               struct brw_reg dst, struct brw_reg *src)
{
   /* PLN reads:
    *                      /   in SIMD16   \
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
    *    -----------------------------------
    *
    * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys:
    *
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|        |        | in SIMD8
    *   |-----------------------------------|
    *   |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
    *    -----------------------------------
    *
    * See also: emit_interpolation_setup_gen4().
    */
   struct brw_reg delta_x = src[0];
   struct brw_reg delta_y = offset(src[0], inst->exec_size / 8);
   struct brw_reg interp = src[1];

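   /* A note on the condition below (assumption based on the PLN region
    * restrictions): PLN reads the deltas as an adjacent register pair, and
    * prior to Gen7 that pair must start on an even register number, hence
    * the (delta_x.nr & 1) == 0 check.
    */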
   if (devinfo->has_pln &&
       (devinfo->gen >= 7 || (delta_x.nr & 1) == 0)) {
      brw_PLN(p, dst, interp, delta_x);
   } else {
      brw_LINE(p, brw_null_reg(), interp, delta_x);
      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
   }
}

void
fs_generator::generate_get_buffer_size(fs_inst *inst,
                                       struct brw_reg dst,
                                       struct brw_reg src,
                                       struct brw_reg surf_index)
{
   assert(devinfo->gen >= 7);
   assert(surf_index.file == BRW_IMMEDIATE_VALUE);

   uint32_t simd_mode;
   int rlen = 4;

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      rlen = 8;
      dst = vec16(dst);
   }

   brw_SAMPLE(p,
              retype(dst, BRW_REGISTER_TYPE_UW),
              inst->base_mrf,
              src,
              surf_index.ud,
              0,
              GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
              rlen, /* response length */
              inst->mlen,
              inst->header_size > 0,
              simd_mode,
              BRW_SAMPLER_RETURN_FORMAT_SINT32);

   brw_mark_surface_used(prog_data, surf_index.ud);
}

void
fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
                           struct brw_reg surface_index,
                           struct brw_reg sampler_index)
{
   int msg_type = -1;
   uint32_t simd_mode;
   uint32_t return_format;
   bool is_combined_send = inst->eot;

   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   /* Stomp the resinfo output type to UINT32.  On gens 4-5, the output type
    * is set as part of the message descriptor.  On gen4, the PRM seems to
    * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
    * later gens UINT32 is required.  Once you hit Sandy Bridge, the bit is
    * gone from the message descriptor entirely and you just get UINT32 all
    * the time regardless.  Since we can really only do non-UINT32 on gen4,
    * just stomp it to UINT32 all the time.
    */
   if (inst->opcode == SHADER_OPCODE_TXS)
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (devinfo->gen >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
         }
         break;
      case SHADER_OPCODE_TXL_LZ:
         assert(devinfo->gen >= 9);
         if (inst->shadow_compare) {
            msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ;
         } else {
            msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LZ;
         }
         break;
      case SHADER_OPCODE_TXS:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
            assert(devinfo->gen >= 8 || devinfo->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
         break;
      case SHADER_OPCODE_TXF:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_LZ:
         assert(devinfo->gen >= 9);
         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
         break;
      case SHADER_OPCODE_TXF_CMS_W:
         assert(devinfo->gen >= 9);
         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
         break;
      case SHADER_OPCODE_TXF_CMS:
         if (devinfo->gen >= 7)
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
         else
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_UMS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
         break;
      case SHADER_OPCODE_TXF_MCS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
         break;
      case SHADER_OPCODE_LOD:
         msg_type = GEN5_SAMPLER_MESSAGE_LOD;
         break;
      case SHADER_OPCODE_TG4:
         if (inst->shadow_compare) {
            assert(devinfo->gen >= 7);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
         } else {
            assert(devinfo->gen >= 6);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         }
         break;
      case SHADER_OPCODE_TG4_OFFSET:
         assert(devinfo->gen >= 7);
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
         }
         break;
      case SHADER_OPCODE_SAMPLEINFO:
         msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
         break;
      default:
         unreachable("not reached");
      }
   } else {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         /* Note that G45 and older determine shadow compare and dispatch
          * width from message length for most messages.
          */
         if (inst->exec_size == 8) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
            if (inst->shadow_compare) {
               assert(inst->mlen == 6);
            } else {
               assert(inst->mlen <= 4);
            }
         } else {
            if (inst->shadow_compare) {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
               assert(inst->mlen == 9);
            } else {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
               assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
            }
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually */
         assert(inst->exec_size == 8);
         assert(inst->mlen == 7 || inst->mlen == 10);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
         break;
      case SHADER_OPCODE_TXF:
         assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      case SHADER_OPCODE_TXS:
         assert(inst->mlen == 3);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      default:
         unreachable("not reached");
      }
   }
   assert(msg_type != -1);

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      dst = vec16(dst);
   }

   assert(devinfo->gen < 7 || inst->header_size == 0 ||
          src.file == BRW_GENERAL_REGISTER_FILE);

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   /* Load the message header if present.  If there's a texture offset,
    * we need to set it up explicitly and load the offset bitfield.
    * Otherwise, we can use an implied move from g0 to the first message reg.
    */
   if (inst->header_size != 0) {
      if (devinfo->gen < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      } else {
         struct brw_reg header_reg;

         if (devinfo->gen >= 7) {
            header_reg = src;
         } else {
            assert(inst->base_mrf != -1);
            header_reg = brw_message_reg(inst->base_mrf);
         }

         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_8);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_MOV(p, header_reg, brw_vec8_grf(0, 0));

         if (inst->offset) {
            /* Set the offset bits in DWord 2. */
            brw_MOV(p, get_element_ud(header_reg, 2),
                    brw_imm_ud(inst->offset));
         } else if (stage != MESA_SHADER_VERTEX &&
                    stage != MESA_SHADER_FRAGMENT) {
            /* The vertex and fragment stages have g0.2 set to 0, so
             * header0.2 is 0 when g0 is copied.  Other stages may not, so we
             * must set it to 0 to avoid setting undesirable bits in the
             * message.
             */
            brw_MOV(p, get_element_ud(header_reg, 2), brw_imm_ud(0));
         }

         brw_adjust_sampler_state_pointer(p, header_reg, sampler_index);
         brw_pop_insn_state(p);
      }
   }

   uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
                                        inst->opcode == SHADER_OPCODE_TG4_OFFSET)
                                       ? prog_data->binding_table.gather_texture_start
                                       : prog_data->binding_table.texture_start;

   if (surface_index.file == BRW_IMMEDIATE_VALUE &&
       sampler_index.file == BRW_IMMEDIATE_VALUE) {
      uint32_t surface = surface_index.ud;
      uint32_t sampler = sampler_index.ud;

      brw_SAMPLE(p,
                 retype(dst, BRW_REGISTER_TYPE_UW),
                 inst->base_mrf,
                 src,
                 surface + base_binding_table_index,
                 sampler % 16,
                 msg_type,
                 inst->regs_written,
                 inst->mlen,
                 inst->header_size != 0,
                 simd_mode,
                 return_format);

      brw_mark_surface_used(prog_data, surface + base_binding_table_index);
   } else {
      /* Non-const sampler index */

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
      struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      if (brw_regs_equal(&surface_reg, &sampler_reg)) {
         brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
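         /* Why 0x101 works (illustrative): for x < 256,
          * x * 0x101 == (x << 8) | x, so a single MUL writes the same index
          * into both the sampler field (bits 15:8) and the surface field
          * (bits 7:0) of a0.0.
          */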
      } else {
         brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
         brw_OR(p, addr, addr, surface_reg);
      }
      if (base_binding_table_index)
         brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
      brw_AND(p, addr, addr, brw_imm_ud(0xfff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, src, addr);
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              msg_type,
                              inst->regs_written,
                              inst->mlen /* mlen */,
                              inst->header_size != 0 /* header */,
                              simd_mode,
                              return_format);

      /* The visitor knows more than we do about the surface limit required,
       * so it has already done the marking.
       */
   }

   if (is_combined_send) {
      brw_inst_set_eot(p->devinfo, brw_last_inst, true);
      brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC);
   }
}

/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 * looking like:
 *
 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 *
 * Ideally, we want to produce:
 *
 *            DDX                    DDY
 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 *
 * and add another set of two more subspans if in 16-pixel dispatch mode.
 *
 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 * pair.  But the ideal approximation may impose a huge performance cost on
 * sample_d.  On at least Haswell, the sample_d instruction does some
 * optimizations if the same LOD is used for all pixels in the subspan.
 *
 * For DDY, we need to use ALIGN16 mode since it's capable of doing the
 * appropriate swizzling.
 */
void
fs_generator::generate_ddx(enum opcode opcode,
                           struct brw_reg dst, struct brw_reg src)
{
   unsigned vstride, width;

   if (opcode == FS_OPCODE_DDX_FINE) {
      /* produce accurate derivatives */
      vstride = BRW_VERTICAL_STRIDE_2;
      width = BRW_WIDTH_2;
   } else {
      /* replicate the derivative at the top-left pixel to other pixels */
      vstride = BRW_VERTICAL_STRIDE_4;
      width = BRW_WIDTH_4;
   }

   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
                                 src.negate, src.abs,
                                 BRW_REGISTER_TYPE_F,
                                 vstride,
                                 width,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
                                 src.negate, src.abs,
                                 BRW_REGISTER_TYPE_F,
                                 vstride,
                                 width,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
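
   /* How this computes a derivative (illustrative, the FS_OPCODE_DDX_FINE
    * case): src0 is a <2;2,0> region starting at subnr 1 (the right pixel
    * of each pair) and src1 the same region at subnr 0 (the left pixel),
    * so the ADD below produces (right - left) replicated across each pair.
    */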
   brw_ADD(p, dst, src0, negate(src1));
}

/* The negate_value boolean is used to negate the derivative computation for
 * FBOs, since they place the origin at the upper left instead of the lower
 * left.
 */
void
fs_generator::generate_ddy(enum opcode opcode,
                           struct brw_reg dst, struct brw_reg src)
{
   if (opcode == FS_OPCODE_DDY_FINE) {
      /* produce accurate derivatives */
      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_1,
                                    BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
      struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_1,
                                    BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_16);
      brw_ADD(p, dst, negate(src0), src1);
      brw_pop_insn_state(p);
   } else {
      /* replicate the derivative at the top-left pixel to other pixels */
      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_0,
                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
      struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_0,
                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
      brw_ADD(p, dst, negate(src0), src1);
   }
}

void
fs_generator::generate_discard_jump(fs_inst *inst)
{
   assert(devinfo->gen >= 6);

   /* This HALT will be patched up at FB write time to point UIP at the end of
    * the program, and at brw_uip_jip() JIP will be set to the end of the
    * current block (or the program).
    */
   this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   gen6_HALT(p);
   brw_pop_insn_state(p);
}

void
fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
{
   /* The 32-wide messages only respect the first 16-wide half of the channel
    * enable signals which are replicated identically for the second group of
    * 16 channels, so we cannot use them unless the write is marked
    * force_writemask_all.
    */
   const unsigned lower_size = inst->force_writemask_all ? inst->exec_size :
                               MIN2(16, inst->exec_size);
   const unsigned block_size = 4 * lower_size / REG_SIZE;
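
   /* Illustrative arithmetic: with 4-byte channels, lower_size == 16 gives
    * block_size = 4 * 16 / 32 = 2 registers per scratch message.
    */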
   assert(inst->mlen != 0);

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, cvt(lower_size) - 1);
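   /* cvt() is assumed here to map the power-of-two sizes 1..32 onto 1..6,
    * so cvt(n) - 1 is the BRW_EXECUTE_* encoding of n, e.g.
    * cvt(16) - 1 == 4 == BRW_EXECUTE_16.
    */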
   brw_set_default_compression(p, lower_size > 8);

   for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
      brw_set_default_group(p, (inst->force_sechalf ? 8 : 0) + lower_size * i);

      brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0),
              retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD));

      brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
                                    block_size,
                                    inst->offset + block_size * REG_SIZE * i);
   }

   brw_pop_insn_state(p);
}

void
fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->exec_size <= 16 || inst->force_writemask_all);
   assert(inst->mlen != 0);

   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
                                inst->exec_size / 8, inst->offset);
}

void
fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->exec_size <= 16 || inst->force_writemask_all);

   gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
}

void
fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
                                                  struct brw_reg dst,
                                                  struct brw_reg index,
                                                  struct brw_reg offset)
{
   assert(inst->mlen != 0);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.ud;

   assert(offset.file == BRW_IMMEDIATE_VALUE &&
          offset.type == BRW_REGISTER_TYPE_UD);
   uint32_t read_offset = offset.ud;

   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
                        read_offset, surf_index);
}

void
fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg offset)
{
   assert(index.type == BRW_REGISTER_TYPE_UD);

   assert(offset.file == BRW_GENERAL_REGISTER_FILE);
   /* Reference just the dword we need, to avoid angering validate_reg(). */
   offset = brw_vec1_grf(offset.nr, 0);

   /* We use the SIMD4x2 mode because we want to end up with 4 components in
    * the destination loaded consecutively from the same offset (which appears
    * in the first component, and the rest are ignored).
    */
   dst.width = BRW_WIDTH_4;

   struct brw_reg src = offset;
   bool header_present = false;

   if (devinfo->gen >= 9) {
      /* Skylake requires a message header in order to use SIMD4x2 mode. */
      src = retype(brw_vec4_grf(offset.nr, 0), BRW_REGISTER_TYPE_UD);
      header_present = true;

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_MOV(p, vec8(src), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      brw_MOV(p, get_element_ud(src, 2),
              brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));
      brw_pop_insn_state(p);
   }

   if (index.file == BRW_IMMEDIATE_VALUE) {

      uint32_t surf_index = index.ud;

      brw_push_insn_state(p);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_inst_set_exec_size(devinfo, send, BRW_EXECUTE_4);
      brw_pop_insn_state(p);

      brw_set_dest(p, send, dst);
      brw_set_src0(p, send, src);
      brw_set_sampler_message(p, send,
                              surf_index,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              1, /* rlen */
                              inst->mlen,
                              header_present,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              0);
   } else {

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = surf_index & 0xff */
      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
      brw_set_dest(p, insn_and, addr);
      brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));

      /* dst = send(payload, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, src, addr);
      brw_set_sampler_message(p, insn,
                              0,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              1, /* rlen */
                              inst->mlen,
                              header_present,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              0);

      brw_pop_insn_state(p);
   }
}

void
fs_generator::generate_varying_pull_constant_load_gen4(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index)
{
   assert(devinfo->gen < 7); /* Should use the gen7 variant. */
   assert(inst->header_size != 0);
   assert(inst->mlen);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.ud;

   uint32_t simd_mode, rlen, msg_type;
   if (inst->exec_size == 16) {
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      rlen = 8;
   } else {
      assert(inst->exec_size == 8);
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      rlen = 4;
   }

   if (devinfo->gen >= 5)
      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
   else {
      /* We always use the SIMD16 message so that we only have to load U, and
       * not V or R.
       */
      msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
      assert(inst->mlen == 3);
      assert(inst->regs_written == 8);
      rlen = 8;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   }

   struct brw_reg header = brw_vec8_grf(0, 0);
   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_compression(devinfo, send, false);
   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, send, header);
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf);

   /* Our surface is set up as floats, regardless of what actual data is
    * stored in it.
    */
   uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
   brw_set_sampler_message(p, send,
                           surf_index,
                           0, /* sampler (unused) */
                           msg_type,
                           rlen,
                           inst->mlen,
                           inst->header_size != 0,
                           simd_mode,
                           return_format);
}

void
fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg offset)
{
   assert(devinfo->gen >= 7);
   /* Varying-offset pull constant loads are treated as a normal expression on
    * gen7, so the fact that it's a send message is hidden at the IR level.
    */
   assert(inst->header_size == 0);
   assert(!inst->mlen);
   assert(index.type == BRW_REGISTER_TYPE_UD);

   uint32_t simd_mode, rlen, mlen;
   if (inst->exec_size == 16) {
      mlen = 2;
      rlen = 8;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   } else {
      assert(inst->exec_size == 8);
      mlen = 1;
      rlen = 4;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
   }

   if (index.file == BRW_IMMEDIATE_VALUE) {

      uint32_t surf_index = index.ud;

      brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
      brw_set_src0(p, send, offset);
      brw_set_sampler_message(p, send,
                              surf_index,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              rlen,
                              mlen,
                              false, /* no header */
                              simd_mode,
                              0);

   } else {

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = surf_index & 0xff */
      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
      brw_set_dest(p, insn_and, addr);
      brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, retype(dst, BRW_REGISTER_TYPE_UW),
         offset, addr);
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              rlen /* rlen */,
                              mlen /* mlen */,
                              false /* header */,
                              simd_mode,
                              0);
   }
}

/**
 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
 * into the flags register (f0.0).
 *
 * Used only on Gen6 and above.
 */
void
fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
{
   struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
   struct brw_reg dispatch_mask;

   if (devinfo->gen >= 6)
      dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
   else
      dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, flags, dispatch_mask);
   brw_pop_insn_state(p);
}

void
fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
                                                struct brw_reg dst,
                                                struct brw_reg src,
                                                struct brw_reg msg_data,
                                                unsigned msg_type)
{
   assert(msg_data.type == BRW_REGISTER_TYPE_UD);

   brw_pixel_interpolator_query(p,
                                retype(dst, BRW_REGISTER_TYPE_UW),
                                src,
                                inst->pi_noperspective,
                                msg_type,
                                msg_data,
                                inst->mlen,
                                inst->regs_written);
}

/**
 * Sets the first word of a vgrf for gen7+ simd4x2 uniform pull constant
 * sampler LD messages.
 *
 * We don't want to bake it into the send message's code generation because
 * that means we don't get a chance to schedule the instructions.
 */
void
fs_generator::generate_set_simd4x2_offset(fs_inst *inst,
                                          struct brw_reg dst,
                                          struct brw_reg value)
{
   assert(value.file == BRW_IMMEDIATE_VALUE);

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
   brw_pop_insn_state(p);
}

/* Sets vstride=1, width=4, hstride=0 of register src1 during
 * the ADD instruction.
 */
void
fs_generator::generate_set_sample_id(fs_inst *inst,
                                     struct brw_reg dst,
                                     struct brw_reg src0,
                                     struct brw_reg src1)
{
   assert(dst.type == BRW_REGISTER_TYPE_D ||
          dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_D ||
          src0.type == BRW_REGISTER_TYPE_UD);

   struct brw_reg reg = stride(src1, 1, 4, 0);
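
   /* What the <1;4,0> region achieves (illustrative): hstride 0 with width
    * 4 repeats one element four times and vstride 1 then steps to the next
    * element, so all four channels of a subspan read that subspan's value
    * from src1.
    */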
   if (devinfo->gen >= 8 || inst->exec_size == 8) {
      brw_ADD(p, dst, src0, reg);
   } else if (inst->exec_size == 16) {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_ADD(p, firsthalf(dst), firsthalf(src0), reg);
      brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
      brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(reg, 2));
      brw_pop_insn_state(p);
   }
}

void
fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
                                            struct brw_reg dst,
                                            struct brw_reg x,
                                            struct brw_reg y)
{
   assert(devinfo->gen >= 7);
   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(x.type == BRW_REGISTER_TYPE_F);
   assert(y.type == BRW_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the destination data type must be Word (W).
    *
    *   The destination must be DWord-aligned and specify a horizontal stride
    *   (HorzStride) of 2.  The 16-bit result is stored in the lower word of
    *   each destination channel and the upper word is not modified.
    */
   struct brw_reg dst_w = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);

   /* Give each 32-bit channel of dst the form below, where "." means
    * unchanged.
    *   0x....hhhh
    */
   brw_F32TO16(p, dst_w, y);

   /* Now the form:
    *   0xhhhh0000
    */
   brw_SHL(p, dst, dst, brw_imm_ud(16u));

   /* And, finally the form of packHalf2x16's output:
    *   0xhhhhllll
    */
   brw_F32TO16(p, dst_w, x);
}

void
fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
                                              struct brw_reg dst,
                                              struct brw_reg src)
{
   assert(devinfo->gen >= 7);
   assert(dst.type == BRW_REGISTER_TYPE_F);
   assert(src.type == BRW_REGISTER_TYPE_UD);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the source data type must be Word (W).  The destination type must be
    *   F (Float).
    */
   struct brw_reg src_w = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

   /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
    * For the Y case, we wish to access only the upper word; therefore
    * a 16-bit subregister offset is needed.
    */
   assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
          inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
   if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
      src_w.subnr += 2;

   brw_F16TO32(p, dst, src_w);
}

void
fs_generator::generate_shader_time_add(fs_inst *inst,
                                       struct brw_reg payload,
                                       struct brw_reg offset,
                                       struct brw_reg value)
{
   assert(devinfo->gen >= 7);
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, true);

   assert(payload.file == BRW_GENERAL_REGISTER_FILE);
   struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
                                          offset.type);
   struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
                                         value.type);

   assert(offset.file == BRW_IMMEDIATE_VALUE);
   if (value.file == BRW_GENERAL_REGISTER_FILE) {
      value.width = BRW_WIDTH_1;
      value.hstride = BRW_HORIZONTAL_STRIDE_0;
      value.vstride = BRW_VERTICAL_STRIDE_0;
   } else {
      assert(value.file == BRW_IMMEDIATE_VALUE);
   }

   /* Trying to deal with setup of the params from the IR is crazy in the FS8
    * case, and we don't really care about squeezing every bit of performance
    * out of this path, so we just emit the MOVs from here.
    */
   brw_MOV(p, payload_offset, offset);
   brw_MOV(p, payload_value, value);
   brw_shader_time_add(p, payload,
                       prog_data->binding_table.shader_time_start);
   brw_pop_insn_state(p);

   brw_mark_surface_used(prog_data,
                         prog_data->binding_table.shader_time_start);
}

void
fs_generator::enable_debug(const char *shader_name)
{
   debug_flag = true;
   this->shader_name = shader_name;
}

int
fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
{
   /* Align to a 64-byte boundary. */
   while (p->next_insn_offset % 64)
      brw_NOP(p);

   this->dispatch_width = dispatch_width;

   int start_offset = p->next_insn_offset;
   int spill_count = 0, fill_count = 0;
   int loop_count = 0;

   struct annotation_info annotation;
   memset(&annotation, 0, sizeof(annotation));

   foreach_block_and_inst (block, fs_inst, inst, cfg) {
      struct brw_reg src[3], dst;
      unsigned int last_insn_offset = p->next_insn_offset;
      bool multiple_instructions_emitted = false;

      /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the
       * "Register Region Restrictions" section: for BDW, SKL:
       *
       *    "A POW/FDIV operation must not be followed by an instruction
       *     that requires two destination registers."
       *
       * The documentation is often lacking annotations for Atom parts,
       * and empirically this affects CHV as well.
       */
      if (devinfo->gen >= 8 &&
          p->nr_insn > 1 &&
          brw_inst_opcode(devinfo, brw_last_inst) == BRW_OPCODE_MATH &&
          brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW &&
          inst->dst.component_size(inst->exec_size) > REG_SIZE) {
         brw_NOP(p);
         last_insn_offset = p->next_insn_offset;
      }

      if (unlikely(debug_flag))
         annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);

      /* If the instruction writes to more than one register, it needs to be
       * explicitly marked as compressed on Gen <= 5.  On Gen >= 6 the
       * hardware figures out by itself what the right compression mode is,
       * but we still need to know whether the instruction is compressed to
       * set up the source register regions appropriately.
       *
       * XXX - This is wrong for instructions that write a single register but
       *       read more than one which should strictly speaking be treated as
       *       compressed.  For instructions that don't write any registers it
       *       relies on the destination being a null register of the correct
       *       type and regioning so the instruction is considered compressed
       *       or not accordingly.
       */
      const bool compressed =
         inst->dst.component_size(inst->exec_size) > REG_SIZE;
      brw_set_default_compression(p, compressed);
      brw_set_default_group(p, inst->force_sechalf ? 8 : 0);

      for (unsigned int i = 0; i < inst->sources; i++) {
         src[i] = brw_reg_from_fs_reg(inst, &inst->src[i], devinfo->gen,
                                      compressed);

         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
          */
         assert(!inst->conditional_mod ||
                inst->src[i].type != BRW_REGISTER_TYPE_UD ||
                !inst->src[i].negate);
      }
      dst = brw_reg_from_fs_reg(inst, &inst->dst, devinfo->gen, compressed);

      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_predicate_control(p, inst->predicate);
      brw_set_default_predicate_inverse(p, inst->predicate_inverse);
      brw_set_default_flag_reg(p, 0, inst->flag_subreg);
      brw_set_default_saturate(p, inst->saturate);
      brw_set_default_mask_control(p, inst->force_writemask_all);
      brw_set_default_acc_write_control(p, inst->writes_accumulator);
      brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);

      assert(inst->force_writemask_all || inst->exec_size >= 8);
      assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
      assert(inst->mlen <= BRW_MAX_MSG_LENGTH);

      switch (inst->opcode) {
1655 case BRW_OPCODE_MOV:
1656 brw_MOV(p, dst, src[0]);
1657 break;
1658 case BRW_OPCODE_ADD:
1659 brw_ADD(p, dst, src[0], src[1]);
1660 break;
1661 case BRW_OPCODE_MUL:
1662 brw_MUL(p, dst, src[0], src[1]);
1663 break;
1664 case BRW_OPCODE_AVG:
1665 brw_AVG(p, dst, src[0], src[1]);
1666 break;
1667 case BRW_OPCODE_MACH:
1668 brw_MACH(p, dst, src[0], src[1]);
1669 break;
1670
1671 case BRW_OPCODE_LINE:
1672 brw_LINE(p, dst, src[0], src[1]);
1673 break;
1674
1675 case BRW_OPCODE_MAD:
1676 assert(devinfo->gen >= 6);
1677 brw_set_default_access_mode(p, BRW_ALIGN_16);
1678 brw_MAD(p, dst, src[0], src[1], src[2]);
1679 break;
1680
1681 case BRW_OPCODE_LRP:
1682 assert(devinfo->gen >= 6);
1683 brw_set_default_access_mode(p, BRW_ALIGN_16);
1684 brw_LRP(p, dst, src[0], src[1], src[2]);
1685 break;
1686
1687 case BRW_OPCODE_FRC:
1688 brw_FRC(p, dst, src[0]);
1689 break;
1690 case BRW_OPCODE_RNDD:
1691 brw_RNDD(p, dst, src[0]);
1692 break;
1693 case BRW_OPCODE_RNDE:
1694 brw_RNDE(p, dst, src[0]);
1695 break;
1696 case BRW_OPCODE_RNDZ:
1697 brw_RNDZ(p, dst, src[0]);
1698 break;
1699
1700 case BRW_OPCODE_AND:
1701 brw_AND(p, dst, src[0], src[1]);
1702 break;
1703 case BRW_OPCODE_OR:
1704 brw_OR(p, dst, src[0], src[1]);
1705 break;
1706 case BRW_OPCODE_XOR:
1707 brw_XOR(p, dst, src[0], src[1]);
1708 break;
1709 case BRW_OPCODE_NOT:
1710 brw_NOT(p, dst, src[0]);
1711 break;
1712 case BRW_OPCODE_ASR:
1713 brw_ASR(p, dst, src[0], src[1]);
1714 break;
1715 case BRW_OPCODE_SHR:
1716 brw_SHR(p, dst, src[0], src[1]);
1717 break;
1718 case BRW_OPCODE_SHL:
1719 brw_SHL(p, dst, src[0], src[1]);
1720 break;
1721 case BRW_OPCODE_F32TO16:
1722 assert(devinfo->gen >= 7);
1723 brw_F32TO16(p, dst, src[0]);
1724 break;
1725 case BRW_OPCODE_F16TO32:
1726 assert(devinfo->gen >= 7);
1727 brw_F16TO32(p, dst, src[0]);
1728 break;
1729 case BRW_OPCODE_CMP:
1730 if (inst->exec_size >= 16 && devinfo->gen == 7 && !devinfo->is_haswell &&
1731 dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
1732 /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround
1733 * implemented in the compiler is not sufficient. Overriding the
1734 * type when the destination is the null register is necessary but
1735 * not sufficient by itself.
1736 */
1737 assert(dst.nr == BRW_ARF_NULL);
1738 dst.type = BRW_REGISTER_TYPE_D;
1739 }
1740 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
1741 break;
1742 case BRW_OPCODE_SEL:
1743 brw_SEL(p, dst, src[0], src[1]);
1744 break;
1745 case BRW_OPCODE_BFREV:
1746 assert(devinfo->gen >= 7);
1747 /* BFREV only supports UD type for src and dst. */
1748 brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
1749 retype(src[0], BRW_REGISTER_TYPE_UD));
1750 break;
1751 case BRW_OPCODE_FBH:
1752 assert(devinfo->gen >= 7);
1753 /* FBH only supports UD type for dst. */
1754 brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1755 break;
1756 case BRW_OPCODE_FBL:
1757 assert(devinfo->gen >= 7);
1758 /* FBL only supports UD type for dst. */
1759 brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1760 break;
1761 case BRW_OPCODE_CBIT:
1762 assert(devinfo->gen >= 7);
1763 /* CBIT only supports UD type for dst. */
1764 brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1765 break;
1766 case BRW_OPCODE_ADDC:
1767 assert(devinfo->gen >= 7);
1768 brw_ADDC(p, dst, src[0], src[1]);
1769 break;
1770 case BRW_OPCODE_SUBB:
1771 assert(devinfo->gen >= 7);
1772 brw_SUBB(p, dst, src[0], src[1]);
1773 break;
1774 case BRW_OPCODE_MAC:
1775 brw_MAC(p, dst, src[0], src[1]);
1776 break;
1777
1778 case BRW_OPCODE_BFE:
1779 assert(devinfo->gen >= 7);
1780 brw_set_default_access_mode(p, BRW_ALIGN_16);
1781 brw_BFE(p, dst, src[0], src[1], src[2]);
1782 break;
1783
1784 case BRW_OPCODE_BFI1:
1785 assert(devinfo->gen >= 7);
1786 brw_BFI1(p, dst, src[0], src[1]);
1787 break;
1788 case BRW_OPCODE_BFI2:
1789 assert(devinfo->gen >= 7);
1790 brw_set_default_access_mode(p, BRW_ALIGN_16);
1791 brw_BFI2(p, dst, src[0], src[1], src[2]);
1792 break;
1793
1794 case BRW_OPCODE_IF:
1795 if (inst->src[0].file != BAD_FILE) {
1796 /* The instruction has an embedded compare (only allowed on gen6) */
1797 assert(devinfo->gen == 6);
1798 gen6_IF(p, inst->conditional_mod, src[0], src[1]);
1799 } else {
1800 brw_IF(p, brw_inst_exec_size(devinfo, p->current));
1801 }
1802 break;
1803
1804 case BRW_OPCODE_ELSE:
1805 brw_ELSE(p);
1806 break;
1807 case BRW_OPCODE_ENDIF:
1808 brw_ENDIF(p);
1809 break;
1810
1811 case BRW_OPCODE_DO:
1812 brw_DO(p, brw_inst_exec_size(devinfo, p->current));
1813 break;
1814
1815 case BRW_OPCODE_BREAK:
1816 brw_BREAK(p);
1817 break;
1818 case BRW_OPCODE_CONTINUE:
1819 brw_CONT(p);
1820 break;
1821
1822 case BRW_OPCODE_WHILE:
1823 brw_WHILE(p);
1824 loop_count++;
1825 break;
1826
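/* On Gen6+ math is a native EU instruction with register sources; on
* Gen4/5 it is a message to the shared math unit, with the operand
* passed through the MRF (hence the mlen/base_mrf handling below).
*/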
1827 case SHADER_OPCODE_RCP:
1828 case SHADER_OPCODE_RSQ:
1829 case SHADER_OPCODE_SQRT:
1830 case SHADER_OPCODE_EXP2:
1831 case SHADER_OPCODE_LOG2:
1832 case SHADER_OPCODE_SIN:
1833 case SHADER_OPCODE_COS:
1834 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
1835 if (devinfo->gen >= 6) {
1836 assert(inst->mlen == 0);
1837 assert(devinfo->gen >= 7 || inst->exec_size == 8);
1838 gen6_math(p, dst, brw_math_function(inst->opcode),
1839 src[0], brw_null_reg());
1840 } else {
1841 assert(inst->mlen >= 1);
1842 assert(devinfo->gen == 5 || devinfo->is_g4x || inst->exec_size == 8);
1843 gen4_math(p, dst,
1844 brw_math_function(inst->opcode),
1845 inst->base_mrf, src[0],
1846 BRW_MATH_PRECISION_FULL);
1847 }
1848 break;
1849 case SHADER_OPCODE_INT_QUOTIENT:
1850 case SHADER_OPCODE_INT_REMAINDER:
1851 case SHADER_OPCODE_POW:
1852 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
1853 if (devinfo->gen >= 6) {
1854 assert(inst->mlen == 0);
1855 assert((devinfo->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) ||
1856 inst->exec_size == 8);
1857 gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
1858 } else {
1859 assert(inst->mlen >= 1);
1860 assert(inst->exec_size == 8);
1861 gen4_math(p, dst, brw_math_function(inst->opcode),
1862 inst->base_mrf, src[0],
1863 BRW_MATH_PRECISION_FULL);
1864 }
1865 break;
1866 case FS_OPCODE_CINTERP:
1867 brw_MOV(p, dst, src[0]);
1868 break;
1869 case FS_OPCODE_LINTERP:
1870 generate_linterp(inst, dst, src);
1871 break;
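/* The payload stores pixel coordinates as groups of four X values
* followed by the four Y values of the same subspan (in UW); a <8;4,1>
* region starting at UW 0 (X) or UW 4 (Y) gathers one coordinate for
* every channel.
*/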
1872 case FS_OPCODE_PIXEL_X:
1873 assert(src[0].type == BRW_REGISTER_TYPE_UW);
1874 src[0].subnr = 0 * type_sz(src[0].type);
1875 brw_MOV(p, dst, stride(src[0], 8, 4, 1));
1876 break;
1877 case FS_OPCODE_PIXEL_Y:
1878 assert(src[0].type == BRW_REGISTER_TYPE_UW);
1879 src[0].subnr = 4 * type_sz(src[0].type);
1880 brw_MOV(p, dst, stride(src[0], 8, 4, 1));
1881 break;
1882 case FS_OPCODE_GET_BUFFER_SIZE:
1883 generate_get_buffer_size(inst, dst, src[0], src[1]);
1884 break;
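/* All of the sampler messages below share one generator; src[0] is the
* message payload while src[1] and src[2] carry the (possibly indirect)
* surface and sampler indices.
*/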
1885 case SHADER_OPCODE_TEX:
1886 case FS_OPCODE_TXB:
1887 case SHADER_OPCODE_TXD:
1888 case SHADER_OPCODE_TXF:
1889 case SHADER_OPCODE_TXF_LZ:
1890 case SHADER_OPCODE_TXF_CMS:
1891 case SHADER_OPCODE_TXF_CMS_W:
1892 case SHADER_OPCODE_TXF_UMS:
1893 case SHADER_OPCODE_TXF_MCS:
1894 case SHADER_OPCODE_TXL:
1895 case SHADER_OPCODE_TXL_LZ:
1896 case SHADER_OPCODE_TXS:
1897 case SHADER_OPCODE_LOD:
1898 case SHADER_OPCODE_TG4:
1899 case SHADER_OPCODE_TG4_OFFSET:
1900 case SHADER_OPCODE_SAMPLEINFO:
1901 generate_tex(inst, dst, src[0], src[1], src[2]);
1902 break;
1903 case FS_OPCODE_DDX_COARSE:
1904 case FS_OPCODE_DDX_FINE:
1905 generate_ddx(inst->opcode, dst, src[0]);
1906 break;
1907 case FS_OPCODE_DDY_COARSE:
1908 case FS_OPCODE_DDY_FINE:
1909 generate_ddy(inst->opcode, dst, src[0]);
1910 break;
1911
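/* Scratch reads and writes are emitted for register spills and fills;
* the counters feed the shader statistics reported below.
*/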
1912 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1913 generate_scratch_write(inst, src[0]);
1914 spill_count++;
1915 break;
1916
1917 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1918 generate_scratch_read(inst, dst);
1919 fill_count++;
1920 break;
1921
1922 case SHADER_OPCODE_GEN7_SCRATCH_READ:
1923 generate_scratch_read_gen7(inst, dst);
1924 fill_count++;
1925 break;
1926
1927 case SHADER_OPCODE_MOV_INDIRECT:
1928 generate_mov_indirect(inst, dst, src[0], src[1]);
1929 break;
1930
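/* The URB opcodes are used by the scalar vertex-pipeline stages, which
* share this generator with the fragment shader.
*/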
1931 case SHADER_OPCODE_URB_READ_SIMD8:
1932 case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
1933 generate_urb_read(inst, dst, src[0]);
1934 break;
1935
1936 case SHADER_OPCODE_URB_WRITE_SIMD8:
1937 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
1938 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
1939 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
1940 generate_urb_write(inst, src[0]);
1941 break;
1942
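/* Pull-constant loads fetch uniforms that did not fit in the push
* constant registers; force_writemask_all is required because the
* result is uniform across channels.
*/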
1943 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1944 assert(inst->force_writemask_all);
1945 generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
1946 break;
1947
1948 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
1949 assert(inst->force_writemask_all);
1950 generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
1951 break;
1952
1953 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
1954 generate_varying_pull_constant_load_gen4(inst, dst, src[0]);
1955 break;
1956
1957 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
1958 generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
1959 break;
1960
1961 case FS_OPCODE_REP_FB_WRITE:
1962 case FS_OPCODE_FB_WRITE:
1963 generate_fb_write(inst, src[0]);
1964 break;
1965
1966 case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
1967 generate_mov_dispatch_to_flags(inst);
1968 break;
1969
1970 case FS_OPCODE_DISCARD_JUMP:
1971 generate_discard_jump(inst);
1972 break;
1973
1974 case SHADER_OPCODE_SHADER_TIME_ADD:
1975 generate_shader_time_add(inst, src[0], src[1], src[2]);
1976 break;
1977
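/* For the surface messages below, src[0] is the message payload, src[1]
* the surface index, and src[2] an immediate whose meaning depends on
* the message: the atomic opcode or the number of channels.
*/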
1978 case SHADER_OPCODE_UNTYPED_ATOMIC:
1979 assert(src[2].file == BRW_IMMEDIATE_VALUE);
1980 brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud,
1981 inst->mlen, !inst->dst.is_null());
1982 break;
1983
1984 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1985 assert(src[2].file == BRW_IMMEDIATE_VALUE);
1986 brw_untyped_surface_read(p, dst, src[0], src[1],
1987 inst->mlen, src[2].ud);
1988 break;
1989
1990 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1991 assert(src[2].file == BRW_IMMEDIATE_VALUE);
1992 brw_untyped_surface_write(p, src[0], src[1],
1993 inst->mlen, src[2].ud);
1994 break;
1995
1996 case SHADER_OPCODE_TYPED_ATOMIC:
1997 assert(src[2].file == BRW_IMMEDIATE_VALUE);
1998 brw_typed_atomic(p, dst, src[0], src[1],
1999 src[2].ud, inst->mlen, !inst->dst.is_null());
2000 break;
2001
2002 case SHADER_OPCODE_TYPED_SURFACE_READ:
2003 assert(src[2].file == BRW_IMMEDIATE_VALUE);
2004 brw_typed_surface_read(p, dst, src[0], src[1],
2005 inst->mlen, src[2].ud);
2006 break;
2007
2008 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
2009 assert(src[2].file == BRW_IMMEDIATE_VALUE);
2010 brw_typed_surface_write(p, src[0], src[1], inst->mlen, src[2].ud);
2011 break;
2012
2013 case SHADER_OPCODE_MEMORY_FENCE:
2014 brw_memory_fence(p, dst);
2015 break;
2016
2017 case FS_OPCODE_SET_SIMD4X2_OFFSET:
2018 generate_set_simd4x2_offset(inst, dst, src[0]);
2019 break;
2020
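/* FIND_LIVE_CHANNEL returns the index of the lowest enabled channel and
* BROADCAST replicates the value of a single channel to all channels;
* together they are used to handle dynamically uniform values.
*/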
2021 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2022 brw_find_live_channel(p, dst);
2023 break;
2024
2025 case SHADER_OPCODE_BROADCAST:
2026 brw_broadcast(p, dst, src[0], src[1]);
2027 break;
2028
2029 case FS_OPCODE_SET_SAMPLE_ID:
2030 generate_set_sample_id(inst, dst, src[0], src[1]);
2031 break;
2032
2033 case FS_OPCODE_PACK_HALF_2x16_SPLIT:
2034 generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
2035 break;
2036
2037 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
2038 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
2039 generate_unpack_half_2x16_split(inst, dst, src[0]);
2040 break;
2041
2042 case FS_OPCODE_PLACEHOLDER_HALT:
2043 /* This is the place where the final HALT needs to be inserted if
2044 * we've emitted any discards. If not, this will emit no code.
2045 */
2046 if (!patch_discard_jumps_to_fb_writes()) {
2047 if (unlikely(debug_flag)) {
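/* No HALT was emitted, so drop the annotation entry created for this
* IR instruction; otherwise it would refer to a nonexistent native
* instruction.
*/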
2048 annotation.ann_count--;
2049 }
2050 }
2051 break;
2052
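/* The interpolateAt*() built-ins map onto Gen7+ pixel interpolator
* messages; the four cases below differ only in the location mode.
*/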
2053 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
2054 generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2055 GEN7_PIXEL_INTERPOLATOR_LOC_CENTROID);
2056 break;
2057
2058 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
2059 generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2060 GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
2061 break;
2062
2063 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
2064 generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2065 GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
2066 break;
2067
2068 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
2069 generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2070 GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
2071 break;
2072
2073 case CS_OPCODE_CS_TERMINATE:
2074 generate_cs_terminate(inst, src[0]);
2075 break;
2076
2077 case SHADER_OPCODE_BARRIER:
2078 generate_barrier(inst, src[0]);
2079 break;
2080
2081 default:
2082 unreachable("Unsupported opcode");
2083
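/* LOAD_PAYLOAD is kept after the default case so that a missed lowering
* pass trips this specific message rather than the generic one.
*/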
2084 case SHADER_OPCODE_LOAD_PAYLOAD:
2085 unreachable("Should be lowered by lower_load_payload()");
2086 }
2087
2088 if (multiple_instructions_emitted)
2089 continue;
2090
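/* Each uncompacted Gen instruction is 16 bytes, which is why the store
* is indexed by last_insn_offset / 16. These flags are only meaningful
* on a single instruction, so assert that exactly one was emitted and
* patch it directly.
*/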
2091 if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
2092 assert(p->next_insn_offset == last_insn_offset + 16 ||
2093 !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
2094 "emitting more than 1 instruction");
2095
2096 brw_inst *last = &p->store[last_insn_offset / 16];
2097
2098 if (inst->conditional_mod)
2099 brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
2100 brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
2101 brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
2102 }
2103 }
2104
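/* Now that the entire program has been emitted, the UIP/JIP jump
* offsets of the flow-control instructions can be resolved.
*/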
2105 brw_set_uip_jip(p);
2106 annotation_finalize(&annotation, p->next_insn_offset);
2107
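/* Debug builds always validate the generated code and assert on the
* result below; release builds only validate when debug output is
* enabled.
*/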
2108 #ifndef NDEBUG
2109 bool validated = brw_validate_instructions(p, start_offset, &annotation);
2110 #else
2111 if (unlikely(debug_flag))
2112 brw_validate_instructions(p, start_offset, &annotation);
2113 #endif
2114
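/* Compaction rewrites eligible instructions into the 8-byte compacted
* format; the before/after sizes measure the savings reported below.
*/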
2115 int before_size = p->next_insn_offset - start_offset;
2116 brw_compact_instructions(p, start_offset, annotation.ann_count,
2117 annotation.ann);
2118 int after_size = p->next_insn_offset - start_offset;
2119
2120 if (unlikely(debug_flag)) {
2121 fprintf(stderr, "Native code for %s\n"
2122 "SIMD%d shader: %d instructions. %d loops. %u cycles. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d"
2123 " bytes (%.0f%%)\n",
2124 shader_name, dispatch_width, before_size / 16, loop_count, cfg->cycle_count,
2125 spill_count, fill_count, promoted_constants, before_size, after_size,
2126 100.0f * (before_size - after_size) / before_size);
2127
2128 dump_assembly(p->store, annotation.ann_count, annotation.ann,
2129 p->devinfo);
2130 ralloc_free(annotation.mem_ctx);
2131 }
2132 assert(validated);
2133
2134 compiler->shader_debug_log(log_data,
2135 "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
2136 "%d:%d spills:fills, Promoted %u constants, "
2137 "compacted %d to %d bytes.",
2138 _mesa_shader_stage_to_abbrev(stage),
2139 dispatch_width, before_size / 16,
2140 loop_count, cfg->cycle_count, spill_count,
2141 fill_count, promoted_constants, before_size,
2142 after_size);
2143
2144 return start_offset;
2145 }
2146
2147 const unsigned *
2148 fs_generator::get_assembly(unsigned int *assembly_size)
2149 {
2150 return brw_get_program(p, assembly_size);
2151 }