intel/fs: Don't use automatic exec size inference
[mesa.git] src/intel/compiler/brw_fs_generator.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs_generator.cpp
25 *
26 * This file supports generating code from the FS LIR to the actual
27 * native instructions.
28 */
29
30 #include "brw_eu.h"
31 #include "brw_fs.h"
32 #include "brw_cfg.h"
33
34 static enum brw_reg_file
35 brw_file_from_reg(fs_reg *reg)
36 {
37 switch (reg->file) {
38 case ARF:
39 return BRW_ARCHITECTURE_REGISTER_FILE;
40 case FIXED_GRF:
41 case VGRF:
42 return BRW_GENERAL_REGISTER_FILE;
43 case MRF:
44 return BRW_MESSAGE_REGISTER_FILE;
45 case IMM:
46 return BRW_IMMEDIATE_VALUE;
47 case BAD_FILE:
48 case ATTR:
49 case UNIFORM:
50 unreachable("not reached");
51 }
52 return BRW_ARCHITECTURE_REGISTER_FILE;
53 }
54
55 static struct brw_reg
56 brw_reg_from_fs_reg(const struct gen_device_info *devinfo, fs_inst *inst,
57 fs_reg *reg, bool compressed)
58 {
59 struct brw_reg brw_reg;
60
61 switch (reg->file) {
62 case MRF:
63 assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
64 /* Fallthrough */
65 case VGRF:
66 if (reg->stride == 0) {
67 brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
68 } else {
69 /* From the Haswell PRM:
70 *
71 * "VertStride must be used to cross GRF register boundaries. This
72 * rule implies that elements within a 'Width' cannot cross GRF
73 * boundaries."
74 *
75 * The maximum width value that could satisfy this restriction is:
76 */
77 const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));
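         /* For example, with a 32-byte GRF (REG_SIZE) a 4-byte float
          * source with stride 2 gives 32 / (2 * 4) = 4, i.e. at most
          * four elements per GRF row.
          */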
78
79 /* Because the hardware can only split source regions at a whole
80 * multiple of width during decompression (i.e. vertically), clamp
81 * the value obtained above to the physical execution size of a
82 * single decompressed chunk of the instruction:
83 */
84 const unsigned phys_width = compressed ? inst->exec_size / 2 :
85 inst->exec_size;
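         /* For example, a compressed SIMD16 instruction executes as two
          * SIMD8 chunks, so each decompressed chunk covers at most eight
          * channels.
          */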
86
87 /* XXX - The equation above is strictly speaking not correct on
88 * hardware that supports unbalanced GRF writes -- On Gen9+
89 * each decompressed chunk of the instruction may have a
90 * different execution size when the number of components
91 * written to each destination GRF is not the same.
92 */
93 if (reg->stride > 4) {
94 /* For registers with an exceptionally large stride, we use a
95 * width of 1 and only use the vertical stride. This only works
96 * for sources since destinations require hstride == 1.
97 */
98 assert(reg != &inst->dst);
99 brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
100 brw_reg = stride(brw_reg, reg->stride, 1, 0);
101 } else {
102 const unsigned width = MIN2(reg_width, phys_width);
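            /* The resulting region is <width * stride; width, stride>,
             * e.g. a width of 4 with stride 2 programs <8;4,2>.
             */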
103 brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
104 brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
105 }
106
107 if (devinfo->gen == 7 && !devinfo->is_haswell) {
108 /* From the IvyBridge PRM (EU Changes by Processor Generation, page 13):
109 * "Each DF (Double Float) operand uses an element size of 4 rather
110 * than 8 and all regioning parameters are twice what the values
111 * would be based on the true element size: ExecSize, Width,
112 * HorzStride, and VertStride. Each DF operand uses a pair of
113          *    channels and all masking and swizzling should be adjusted
114 * appropriately."
115 *
116 * From the IvyBridge PRM (Special Requirements for Handling Double
117 * Precision Data Types, page 71):
118 * "In Align1 mode, all regioning parameters like stride, execution
119 * size, and width must use the syntax of a pair of packed
120 * floats. The offsets for these data types must be 64-bit
121 * aligned. The execution size and regioning parameters are in terms
122 * of floats."
123 *
124 * Summarized: when handling DF-typed arguments, ExecSize,
125 * VertStride, and Width must be doubled.
126 *
127 * It applies to BayTrail too.
128 */
129 if (type_sz(reg->type) == 8) {
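            /* Width and VertStride use log2-style encodings, so
             * incrementing the encoded field doubles the effective value.
             */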
130 brw_reg.width++;
131 if (brw_reg.vstride > 0)
132 brw_reg.vstride++;
133 assert(brw_reg.hstride == BRW_HORIZONTAL_STRIDE_1);
134 }
135
136 /* When converting from DF->F, we set the destination stride to 2
137          * because each d2f conversion implicitly writes 2 floats, the
138          * first one being the converted value. IVB/BYT actually writes two
139 * F components per SIMD channel, and every other component is
140 * filled with garbage.
141 */
142 if (reg == &inst->dst && get_exec_type_size(inst) == 8 &&
143 type_sz(inst->dst.type) < 8) {
144 assert(brw_reg.hstride > BRW_HORIZONTAL_STRIDE_1);
145 brw_reg.hstride--;
146 }
147 }
148 }
149
150 brw_reg = retype(brw_reg, reg->type);
151 brw_reg = byte_offset(brw_reg, reg->offset);
152 brw_reg.abs = reg->abs;
153 brw_reg.negate = reg->negate;
154 break;
155 case ARF:
156 case FIXED_GRF:
157 case IMM:
158 assert(reg->offset == 0);
159 brw_reg = reg->as_brw_reg();
160 break;
161 case BAD_FILE:
162 /* Probably unused. */
163 brw_reg = brw_null_reg();
164 break;
165 case ATTR:
166 case UNIFORM:
167 unreachable("not reached");
168 }
169
170 /* On HSW+, scalar DF sources can be accessed using the normal <0,1,0>
171 * region, but on IVB and BYT DF regions must be programmed in terms of
172 * floats. A <0,2,1> region accomplishes this.
173 */
174 if (devinfo->gen == 7 && !devinfo->is_haswell &&
175 type_sz(reg->type) == 8 &&
176 brw_reg.vstride == BRW_VERTICAL_STRIDE_0 &&
177 brw_reg.width == BRW_WIDTH_1 &&
178 brw_reg.hstride == BRW_HORIZONTAL_STRIDE_0) {
179 brw_reg.width = BRW_WIDTH_2;
180 brw_reg.hstride = BRW_HORIZONTAL_STRIDE_1;
181 }
182
183 return brw_reg;
184 }
185
186 fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
187 void *mem_ctx,
188 const void *key,
189 struct brw_stage_prog_data *prog_data,
190 unsigned promoted_constants,
191 bool runtime_check_aads_emit,
192 gl_shader_stage stage)
193
194 : compiler(compiler), log_data(log_data),
195 devinfo(compiler->devinfo), key(key),
196 prog_data(prog_data),
197 promoted_constants(promoted_constants),
198 runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
199 stage(stage), mem_ctx(mem_ctx)
200 {
201 p = rzalloc(mem_ctx, struct brw_codegen);
202 brw_init_codegen(devinfo, p, mem_ctx);
203
204 /* In the FS code generator, we are very careful to ensure that we always
205 * set the right execution size so we don't need the EU code to "help" us
206 * by trying to infer it. Sometimes, it infers the wrong thing.
207 */
208 p->automatic_exec_sizes = false;
209 }
210
211 fs_generator::~fs_generator()
212 {
213 }
214
215 class ip_record : public exec_node {
216 public:
217 DECLARE_RALLOC_CXX_OPERATORS(ip_record)
218
219 ip_record(int ip)
220 {
221 this->ip = ip;
222 }
223
224 int ip;
225 };
226
227 bool
228 fs_generator::patch_discard_jumps_to_fb_writes()
229 {
230 if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
231 return false;
232
233 int scale = brw_jump_scale(p->devinfo);
234
235 /* There is a somewhat strange undocumented requirement of using
236 * HALT, according to the simulator. If some channel has HALTed to
237 * a particular UIP, then by the end of the program, every channel
238 * must have HALTed to that UIP. Furthermore, the tracking is a
239 * stack, so you can't do the final halt of a UIP after starting
240 * halting to a new UIP.
241 *
242 * Symptoms of not emitting this instruction on actual hardware
243 * included GPU hangs and sparkly rendering on the piglit discard
244 * tests.
245 */
246 brw_inst *last_halt = gen6_HALT(p);
247 brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
248 brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
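   /* A UIP/JIP of one instruction makes this final HALT simply fall
    * through to the next instruction for every channel.
    */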
249
250 int ip = p->nr_insn;
251
252 foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
253 brw_inst *patch = &p->store[patch_ip->ip];
254
255 assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
256 /* HALT takes a half-instruction distance from the pre-incremented IP. */
257 brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
258 }
259
260 this->discard_halt_patches.make_empty();
261 return true;
262 }
263
264 void
265 fs_generator::fire_fb_write(fs_inst *inst,
266 struct brw_reg payload,
267 struct brw_reg implied_header,
268 GLuint nr)
269 {
270 uint32_t msg_control;
271
272 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
273
274 if (devinfo->gen < 6) {
275 brw_push_insn_state(p);
276 brw_set_default_exec_size(p, BRW_EXECUTE_8);
277 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
278 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
279 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
280 brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0));
281 brw_pop_insn_state(p);
282 }
283
284 if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
285 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
286 else if (prog_data->dual_src_blend) {
287 if (!inst->group)
288 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
289 else
290 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
291 } else if (inst->exec_size == 16)
292 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
293 else
294 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
295
296 /* We assume render targets start at 0, because headerless FB write
297 * messages set "Render Target Index" to 0. Using a different binding
298 * table index would make it impossible to use headerless messages.
299 */
300 assert(prog_data->binding_table.render_target_start == 0);
301
302 const uint32_t surf_index = inst->target;
303
304 bool last_render_target = inst->eot ||
305 (prog_data->dual_src_blend && dispatch_width == 16);
306
307
308 brw_fb_WRITE(p,
309 payload,
310 implied_header,
311 msg_control,
312 surf_index,
313 nr,
314 0,
315 inst->eot,
316 last_render_target,
317 inst->header_size != 0);
318
319 brw_mark_surface_used(&prog_data->base, surf_index);
320 }
321
322 void
323 fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
324 {
325 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
326 const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key;
327 struct brw_reg implied_header;
328
329 if (devinfo->gen < 8 && !devinfo->is_haswell) {
330 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
331 }
332
333 if (inst->base_mrf >= 0)
334 payload = brw_message_reg(inst->base_mrf);
335
336    /* The header is 2 regs; g0 and g1 are the contents. g0 is filled by an
337     * implied move, so here we set up g1.
338 */
339 if (inst->header_size != 0) {
340 brw_push_insn_state(p);
341 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
342 brw_set_default_exec_size(p, BRW_EXECUTE_1);
343 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
344 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
345 brw_set_default_flag_reg(p, 0, 0);
346
347 /* On HSW, the GPU will use the predicate on SENDC, unless the header is
348 * present.
349 */
350 if (prog_data->uses_kill) {
351 struct brw_reg pixel_mask;
352
353 if (devinfo->gen >= 6)
354 pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
355 else
356 pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
357
358 brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
359 }
360
361 if (devinfo->gen >= 6) {
362 brw_push_insn_state(p);
363 brw_set_default_exec_size(p, BRW_EXECUTE_16);
364 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
365 brw_MOV(p,
366 retype(payload, BRW_REGISTER_TYPE_UD),
367 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
368 brw_pop_insn_state(p);
369
370 if (inst->target > 0 && key->replicate_alpha) {
371 /* Set "Source0 Alpha Present to RenderTarget" bit in message
372 * header.
373 */
374 brw_OR(p,
375 vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
376 vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
377 brw_imm_ud(0x1 << 11));
378 }
379
380 if (inst->target > 0) {
381 /* Set the render target index for choosing BLEND_STATE. */
382 brw_MOV(p, retype(vec1(suboffset(payload, 2)),
383 BRW_REGISTER_TYPE_UD),
384 brw_imm_ud(inst->target));
385 }
386
387          /* Set the "computes stencil" bit in the message header. */
388 if (prog_data->computed_stencil) {
389 brw_OR(p,
390 vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
391 vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
392 brw_imm_ud(0x1 << 14));
393 }
394
395 implied_header = brw_null_reg();
396 } else {
397 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
398 }
399
400 brw_pop_insn_state(p);
401 } else {
402 implied_header = brw_null_reg();
403 }
404
405 if (!runtime_check_aads_emit) {
406 fire_fb_write(inst, payload, implied_header, inst->mlen);
407 } else {
408 /* This can only happen in gen < 6 */
409 assert(devinfo->gen < 6);
410
411 struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
412
413 /* Check runtime bit to detect if we have to send AA data or not */
414 brw_push_insn_state(p);
415 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
416 brw_set_default_exec_size(p, BRW_EXECUTE_1);
417 brw_AND(p,
418 v1_null_ud,
419 retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
420 brw_imm_ud(1<<26));
421 brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
422
423 int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
424 brw_pop_insn_state(p);
425 {
426 /* Don't send AA data */
427 fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
428 }
429 brw_land_fwd_jump(p, jmp);
430 fire_fb_write(inst, payload, implied_header, inst->mlen);
431 }
432 }
433
434 void
435 fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst,
436 struct brw_reg payload)
437 {
438 assert(inst->size_written % REG_SIZE == 0);
439 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
440 const unsigned surf_index =
441 prog_data->binding_table.render_target_start + inst->target;
442
443 gen9_fb_READ(p, dst, payload, surf_index,
444 inst->header_size, inst->size_written / REG_SIZE,
445 prog_data->persample_dispatch);
446
447 brw_mark_surface_used(&prog_data->base, surf_index);
448 }
449
450 void
451 fs_generator::generate_mov_indirect(fs_inst *inst,
452 struct brw_reg dst,
453 struct brw_reg reg,
454 struct brw_reg indirect_byte_offset)
455 {
456 assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
457 assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);
458 assert(!reg.abs && !reg.negate);
459 assert(reg.type == dst.type);
460
461 unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;
462
463 if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
464 imm_byte_offset += indirect_byte_offset.ud;
465
466 reg.nr = imm_byte_offset / REG_SIZE;
467 reg.subnr = imm_byte_offset % REG_SIZE;
468 brw_MOV(p, dst, reg);
469 } else {
470 /* Prior to Broadwell, there are only 8 address registers. */
471 assert(inst->exec_size <= 8 || devinfo->gen >= 8);
472
473 /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
474 struct brw_reg addr = vec8(brw_address_reg(0));
475
476 /* The destination stride of an instruction (in bytes) must be greater
477 * than or equal to the size of the rest of the instruction. Since the
478 * address register is of type UW, we can't use a D-type instruction.
479        * In order to get around this, we retype to UW and use a stride.
480 */
481 indirect_byte_offset =
482 retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);
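      /* spread(..., 2) doubles the horizontal stride, so the UW retype reads
       * the low 16 bits of each original dword element.
       */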
483
484 /* There are a number of reasons why we don't use the base offset here.
485 * One reason is that the field is only 9 bits which means we can only
486 * use it to access the first 16 GRFs. Also, from the Haswell PRM
487 * section "Register Region Restrictions":
488 *
489 * "The lower bits of the AddressImmediate must not overflow to
490 * change the register address. The lower 5 bits of Address
491 * Immediate when added to lower 5 bits of address register gives
492 * the sub-register offset. The upper bits of Address Immediate
493 * when added to upper bits of address register gives the register
494 * address. Any overflow from sub-register offset is dropped."
495 *
496 * Since the indirect may cause us to cross a register boundary, this
497        * makes the base offset almost useless.  We could try to do something
498        * clever where we use an actual base offset if base_offset % 32 == 0, but
499 * that would mean we were generating different code depending on the
500 * base offset. Instead, for the sake of consistency, we'll just do the
501 * add ourselves. This restriction is only listed in the Haswell PRM
502 * but empirical testing indicates that it applies on all older
503 * generations and is lifted on Broadwell.
504 *
505 * In the end, while base_offset is nice to look at in the generated
506 * code, using it saves us 0 instructions and would require quite a bit
507 * of case-by-case work. It's just not worth it.
508 */
509 brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
510
511 if (type_sz(reg.type) > 4 &&
512 ((devinfo->gen == 7 && !devinfo->is_haswell) ||
513 devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
514 /* IVB has an issue (which we found empirically) where it reads two
515 * address register components per channel for indirectly addressed
516 * 64-bit sources.
517 *
518 * From the Cherryview PRM Vol 7. "Register Region Restrictions":
519 *
520 * "When source or destination datatype is 64b or operation is
521 * integer DWord multiply, indirect addressing must not be used."
522 *
523           * To work around both of these, we do two integer MOVs instead of one
524 * 64-bit MOV. Because no double value should ever cross a register
525 * boundary, it's safe to use the immediate offset in the indirect
526 * here to handle adding 4 bytes to the offset and avoid the extra
527 * ADD to the register file.
528 */
529 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
530 retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
531 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
532 retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
533 } else {
534 struct brw_reg ind_src = brw_VxH_indirect(0, 0);
535
536 brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type));
537
538 if (devinfo->gen == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
539 !inst->get_next()->is_tail_sentinel() &&
540 ((fs_inst *)inst->get_next())->mlen > 0) {
541 /* From the Sandybridge PRM:
542 *
543 * "[Errata: DevSNB(SNB)] If MRF register is updated by any
544 * instruction that “indexed/indirect” source AND is followed
545 * by a send, the instruction requires a “Switch”. This is to
546 * avoid race condition where send may dispatch before MRF is
547 * updated."
548 */
549 brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
550 }
551 }
552 }
553 }
554
555 void
556 fs_generator::generate_urb_read(fs_inst *inst,
557 struct brw_reg dst,
558 struct brw_reg header)
559 {
560 assert(inst->size_written % REG_SIZE == 0);
561 assert(header.file == BRW_GENERAL_REGISTER_FILE);
562 assert(header.type == BRW_REGISTER_TYPE_UD);
563
564 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
565 brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
566 brw_set_src0(p, send, header);
567 brw_set_src1(p, send, brw_imm_ud(0u));
568
569 brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB);
570 brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ);
571
572 if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT)
573 brw_inst_set_urb_per_slot_offset(p->devinfo, send, true);
574
575 brw_inst_set_mlen(p->devinfo, send, inst->mlen);
576 brw_inst_set_rlen(p->devinfo, send, inst->size_written / REG_SIZE);
577 brw_inst_set_header_present(p->devinfo, send, true);
578 brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset);
579 }
580
581 void
582 fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
583 {
584 brw_inst *insn;
585
586 insn = brw_next_insn(p, BRW_OPCODE_SEND);
587
588 brw_set_dest(p, insn, brw_null_reg());
589 brw_set_src0(p, insn, payload);
590 brw_set_src1(p, insn, brw_imm_d(0));
591
592 brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB);
593 brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE);
594
595 if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
596 inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
597 brw_inst_set_urb_per_slot_offset(p->devinfo, insn, true);
598
599 if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
600 inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
601 brw_inst_set_urb_channel_mask_present(p->devinfo, insn, true);
602
603 brw_inst_set_mlen(p->devinfo, insn, inst->mlen);
604 brw_inst_set_rlen(p->devinfo, insn, 0);
605 brw_inst_set_eot(p->devinfo, insn, inst->eot);
606 brw_inst_set_header_present(p->devinfo, insn, true);
607 brw_inst_set_urb_global_offset(p->devinfo, insn, inst->offset);
608 }
609
610 void
611 fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
612 {
613 struct brw_inst *insn;
614
615 insn = brw_next_insn(p, BRW_OPCODE_SEND);
616
617 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
618 brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW));
619 brw_set_src1(p, insn, brw_imm_d(0));
620
621 /* Terminate a compute shader by sending a message to the thread spawner.
622 */
623 brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER);
624 brw_inst_set_mlen(devinfo, insn, 1);
625 brw_inst_set_rlen(devinfo, insn, 0);
626 brw_inst_set_eot(devinfo, insn, inst->eot);
627 brw_inst_set_header_present(devinfo, insn, false);
628
629 brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */
630 brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */
631
632 /* Note that even though the thread has a URB resource associated with it,
633 * we set the "do not dereference URB" bit, because the URB resource is
634 * managed by the fixed-function unit, so it will free it automatically.
635 */
636 brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */
637
638 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
639 }
640
641 void
642 fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src)
643 {
644 brw_barrier(p, src);
645 brw_WAIT(p);
646 }
647
648 void
649 fs_generator::generate_linterp(fs_inst *inst,
650 struct brw_reg dst, struct brw_reg *src)
651 {
652 /* PLN reads:
653 * / in SIMD16 \
654 * -----------------------------------
655 * | src1+0 | src1+1 | src1+2 | src1+3 |
656 * |-----------------------------------|
657 * |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
658 * -----------------------------------
659 *
660 * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys:
661 *
662 * -----------------------------------
663 * | src1+0 | src1+1 | src1+2 | src1+3 |
664 * |-----------------------------------|
665 * |(x0, x1)|(y0, y1)| | | in SIMD8
666 * |-----------------------------------|
667 * |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
668 * -----------------------------------
669 *
670 * See also: emit_interpolation_setup_gen4().
671 */
672 struct brw_reg delta_x = src[0];
673 struct brw_reg delta_y = offset(src[0], inst->exec_size / 8);
674 struct brw_reg interp = src[1];
675
676 if (devinfo->has_pln &&
677 (devinfo->gen >= 7 || (delta_x.nr & 1) == 0)) {
678 brw_PLN(p, dst, interp, delta_x);
679 } else {
680 brw_LINE(p, brw_null_reg(), interp, delta_x);
681 brw_MAC(p, dst, suboffset(interp, 1), delta_y);
682 }
683 }
684
685 void
686 fs_generator::generate_get_buffer_size(fs_inst *inst,
687 struct brw_reg dst,
688 struct brw_reg src,
689 struct brw_reg surf_index)
690 {
691 assert(devinfo->gen >= 7);
692 assert(surf_index.file == BRW_IMMEDIATE_VALUE);
693
694 uint32_t simd_mode;
695 int rlen = 4;
696
697 switch (inst->exec_size) {
698 case 8:
699 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
700 break;
701 case 16:
702 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
703 break;
704 default:
705 unreachable("Invalid width for texture instruction");
706 }
707
708 if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
709 rlen = 8;
710 dst = vec16(dst);
711 }
712
713 brw_SAMPLE(p,
714 retype(dst, BRW_REGISTER_TYPE_UW),
715 inst->base_mrf,
716 src,
717 surf_index.ud,
718 0,
719 GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
720 rlen, /* response length */
721 inst->mlen,
722 inst->header_size > 0,
723 simd_mode,
724 BRW_SAMPLER_RETURN_FORMAT_SINT32);
725
726 brw_mark_surface_used(prog_data, surf_index.ud);
727 }
728
729 void
730 fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
731 struct brw_reg surface_index,
732 struct brw_reg sampler_index)
733 {
734 assert(inst->size_written % REG_SIZE == 0);
735 int msg_type = -1;
736 uint32_t simd_mode;
737 uint32_t return_format;
738 bool is_combined_send = inst->eot;
739
740 switch (dst.type) {
741 case BRW_REGISTER_TYPE_D:
742 return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
743 break;
744 case BRW_REGISTER_TYPE_UD:
745 return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
746 break;
747 default:
748 return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
749 break;
750 }
751
752 /* Stomp the resinfo output type to UINT32. On gens 4-5, the output type
753 * is set as part of the message descriptor. On gen4, the PRM seems to
754 * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
755 * later gens UINT32 is required. Once you hit Sandy Bridge, the bit is
756 * gone from the message descriptor entirely and you just get UINT32 all
757     * the time regardless.  Since we can really only do non-UINT32 on gen4,
758 * just stomp it to UINT32 all the time.
759 */
760 if (inst->opcode == SHADER_OPCODE_TXS)
761 return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
762
763 switch (inst->exec_size) {
764 case 8:
765 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
766 break;
767 case 16:
768 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
769 break;
770 default:
771 unreachable("Invalid width for texture instruction");
772 }
773
774 if (devinfo->gen >= 5) {
775 switch (inst->opcode) {
776 case SHADER_OPCODE_TEX:
777 if (inst->shadow_compare) {
778 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
779 } else {
780 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
781 }
782 break;
783 case FS_OPCODE_TXB:
784 if (inst->shadow_compare) {
785 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
786 } else {
787 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
788 }
789 break;
790 case SHADER_OPCODE_TXL:
791 if (inst->shadow_compare) {
792 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
793 } else {
794 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
795 }
796 break;
797 case SHADER_OPCODE_TXL_LZ:
798 assert(devinfo->gen >= 9);
799 if (inst->shadow_compare) {
800 msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ;
801 } else {
802 msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LZ;
803 }
804 break;
805 case SHADER_OPCODE_TXS:
806 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
807 break;
808 case SHADER_OPCODE_TXD:
809 if (inst->shadow_compare) {
810 /* Gen7.5+. Otherwise, lowered in NIR */
811 assert(devinfo->gen >= 8 || devinfo->is_haswell);
812 msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
813 } else {
814 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
815 }
816 break;
817 case SHADER_OPCODE_TXF:
818 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
819 break;
820 case SHADER_OPCODE_TXF_LZ:
821 assert(devinfo->gen >= 9);
822 msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
823 break;
824 case SHADER_OPCODE_TXF_CMS_W:
825 assert(devinfo->gen >= 9);
826 msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
827 break;
828 case SHADER_OPCODE_TXF_CMS:
829 if (devinfo->gen >= 7)
830 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
831 else
832 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
833 break;
834 case SHADER_OPCODE_TXF_UMS:
835 assert(devinfo->gen >= 7);
836 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
837 break;
838 case SHADER_OPCODE_TXF_MCS:
839 assert(devinfo->gen >= 7);
840 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
841 break;
842 case SHADER_OPCODE_LOD:
843 msg_type = GEN5_SAMPLER_MESSAGE_LOD;
844 break;
845 case SHADER_OPCODE_TG4:
846 if (inst->shadow_compare) {
847 assert(devinfo->gen >= 7);
848 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
849 } else {
850 assert(devinfo->gen >= 6);
851 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
852 }
853 break;
854 case SHADER_OPCODE_TG4_OFFSET:
855 assert(devinfo->gen >= 7);
856 if (inst->shadow_compare) {
857 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
858 } else {
859 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
860 }
861 break;
862 case SHADER_OPCODE_SAMPLEINFO:
863 msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
864 break;
865 default:
866 unreachable("not reached");
867 }
868 } else {
869 switch (inst->opcode) {
870 case SHADER_OPCODE_TEX:
871          /* Note that G45 and older determine shadow compare and dispatch width
872 * from message length for most messages.
873 */
874 if (inst->exec_size == 8) {
875 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
876 if (inst->shadow_compare) {
877 assert(inst->mlen == 6);
878 } else {
879 assert(inst->mlen <= 4);
880 }
881 } else {
882 if (inst->shadow_compare) {
883 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
884 assert(inst->mlen == 9);
885 } else {
886 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
887 assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
888 }
889 }
890 break;
891 case FS_OPCODE_TXB:
892 if (inst->shadow_compare) {
893 assert(inst->exec_size == 8);
894 assert(inst->mlen == 6);
895 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
896 } else {
897 assert(inst->mlen == 9);
898 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
899 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
900 }
901 break;
902 case SHADER_OPCODE_TXL:
903 if (inst->shadow_compare) {
904 assert(inst->exec_size == 8);
905 assert(inst->mlen == 6);
906 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
907 } else {
908 assert(inst->mlen == 9);
909 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
910 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
911 }
912 break;
913 case SHADER_OPCODE_TXD:
914 /* There is no sample_d_c message; comparisons are done manually */
915 assert(inst->exec_size == 8);
916 assert(inst->mlen == 7 || inst->mlen == 10);
917 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
918 break;
919 case SHADER_OPCODE_TXF:
920 assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
921 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
922 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
923 break;
924 case SHADER_OPCODE_TXS:
925 assert(inst->mlen == 3);
926 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
927 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
928 break;
929 default:
930 unreachable("not reached");
931 }
932 }
933 assert(msg_type != -1);
934
935 if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
936 dst = vec16(dst);
937 }
938
939 assert(devinfo->gen < 7 || inst->header_size == 0 ||
940 src.file == BRW_GENERAL_REGISTER_FILE);
941
942 assert(sampler_index.type == BRW_REGISTER_TYPE_UD);
943
944 /* Load the message header if present. If there's a texture offset,
945 * we need to set it up explicitly and load the offset bitfield.
946 * Otherwise, we can use an implied move from g0 to the first message reg.
947 */
948 if (inst->header_size != 0) {
949 if (devinfo->gen < 6 && !inst->offset) {
950 /* Set up an implied move from g0 to the MRF. */
951 src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
952 } else {
953 struct brw_reg header_reg;
954
955 if (devinfo->gen >= 7) {
956 header_reg = src;
957 } else {
958 assert(inst->base_mrf != -1);
959 header_reg = brw_message_reg(inst->base_mrf);
960 }
961
962 brw_push_insn_state(p);
963 brw_set_default_exec_size(p, BRW_EXECUTE_8);
964 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
965 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
966 /* Explicitly set up the message header by copying g0 to the MRF. */
967 brw_MOV(p, header_reg, brw_vec8_grf(0, 0));
968
969 brw_set_default_exec_size(p, BRW_EXECUTE_1);
970 if (inst->offset) {
971 /* Set the offset bits in DWord 2. */
972 brw_MOV(p, get_element_ud(header_reg, 2),
973 brw_imm_ud(inst->offset));
974 } else if (stage != MESA_SHADER_VERTEX &&
975 stage != MESA_SHADER_FRAGMENT) {
976 /* The vertex and fragment stages have g0.2 set to 0, so
977 * header0.2 is 0 when g0 is copied. Other stages may not, so we
978 * must set it to 0 to avoid setting undesirable bits in the
979 * message.
980 */
981 brw_MOV(p, get_element_ud(header_reg, 2), brw_imm_ud(0));
982 }
983
984 brw_adjust_sampler_state_pointer(p, header_reg, sampler_index);
985 brw_pop_insn_state(p);
986 }
987 }
988
989 uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
990 inst->opcode == SHADER_OPCODE_TG4_OFFSET)
991 ? prog_data->binding_table.gather_texture_start
992 : prog_data->binding_table.texture_start;
993
994 if (surface_index.file == BRW_IMMEDIATE_VALUE &&
995 sampler_index.file == BRW_IMMEDIATE_VALUE) {
996 uint32_t surface = surface_index.ud;
997 uint32_t sampler = sampler_index.ud;
998
999 brw_SAMPLE(p,
1000 retype(dst, BRW_REGISTER_TYPE_UW),
1001 inst->base_mrf,
1002 src,
1003 surface + base_binding_table_index,
1004 sampler % 16,
1005 msg_type,
1006 inst->size_written / REG_SIZE,
1007 inst->mlen,
1008 inst->header_size != 0,
1009 simd_mode,
1010 return_format);
1011
1012 brw_mark_surface_used(prog_data, surface + base_binding_table_index);
1013 } else {
1014 /* Non-const sampler index */
1015
1016 struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
1017 struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
1018 struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
1019
1020 brw_push_insn_state(p);
1021 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1022 brw_set_default_access_mode(p, BRW_ALIGN_1);
1023 brw_set_default_exec_size(p, BRW_EXECUTE_1);
1024
1025 if (brw_regs_equal(&surface_reg, &sampler_reg)) {
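         /* Multiplying by 0x101 replicates the 8-bit index into both fields:
          * v * 0x101 == (v << 8) | v.
          */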
1026 brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
1027 } else {
1028 if (sampler_reg.file == BRW_IMMEDIATE_VALUE) {
1029 brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8));
1030 } else {
1031 brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
1032 brw_OR(p, addr, addr, surface_reg);
1033 }
1034 }
1035 if (base_binding_table_index)
1036 brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
1037 brw_AND(p, addr, addr, brw_imm_ud(0xfff));
1038
1039 brw_pop_insn_state(p);
1040
1041 /* dst = send(offset, a0.0 | <descriptor>) */
1042 brw_inst *insn = brw_send_indirect_message(
1043 p, BRW_SFID_SAMPLER, dst, src, addr);
1044 brw_set_sampler_message(p, insn,
1045 0 /* surface */,
1046 0 /* sampler */,
1047 msg_type,
1048 inst->size_written / REG_SIZE,
1049 inst->mlen /* mlen */,
1050 inst->header_size != 0 /* header */,
1051 simd_mode,
1052 return_format);
1053
1054       /* The visitor knows more than we do about the required surface limit,
1055        * so it has already done the marking.
1056 */
1057 }
1058
1059 if (is_combined_send) {
1060 brw_inst_set_eot(p->devinfo, brw_last_inst, true);
1061 brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC);
1062 }
1063 }
1064
1065
1066 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
1067 * looking like:
1068 *
1069 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
1070 *
1071 * Ideally, we want to produce:
1072 *
1073 * DDX DDY
1074 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
1075 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
1076 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
1077 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
1078 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
1079 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
1080 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
1081 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
1082 *
1083 * and add another set of two more subspans if in 16-pixel dispatch mode.
1084 *
1085 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
1086 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
1087 * pair. But the ideal approximation may impose a huge performance cost on
1088  * sample_d.  On at least Haswell, the sample_d instruction does some
1089 * optimizations if the same LOD is used for all pixels in the subspan.
1090 *
1091 * For DDY, we need to use ALIGN16 mode since it's capable of doing the
1092 * appropriate swizzling.
1093 */
1094 void
1095 fs_generator::generate_ddx(enum opcode opcode,
1096 struct brw_reg dst, struct brw_reg src)
1097 {
1098 unsigned vstride, width;
1099
1100 if (opcode == FS_OPCODE_DDX_FINE) {
1101 /* produce accurate derivatives */
1102 vstride = BRW_VERTICAL_STRIDE_2;
1103 width = BRW_WIDTH_2;
1104 } else {
1105 /* replicate the derivative at the top-left pixel to other pixels */
1106 vstride = BRW_VERTICAL_STRIDE_4;
1107 width = BRW_WIDTH_4;
1108 }
1109
1110 struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
1111 src.negate, src.abs,
1112 BRW_REGISTER_TYPE_F,
1113 vstride,
1114 width,
1115 BRW_HORIZONTAL_STRIDE_0,
1116 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1117 struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
1118 src.negate, src.abs,
1119 BRW_REGISTER_TYPE_F,
1120 vstride,
1121 width,
1122 BRW_HORIZONTAL_STRIDE_0,
1123 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
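   /* src0 starts one element into each group and src1 at its start, so
    * src0 - src1 is the horizontal difference, replicated across the
    * group by the zero horizontal stride.
    */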
1124 brw_ADD(p, dst, src0, negate(src1));
1125 }
1126
1127 /* The derivative computation in Y is negated for FBOs, since they
1128  * place the origin at the upper left instead of the lower
1129  * left.
1130 */
1131 void
1132 fs_generator::generate_ddy(enum opcode opcode,
1133 struct brw_reg dst, struct brw_reg src)
1134 {
1135 if (opcode == FS_OPCODE_DDY_FINE) {
1136 /* produce accurate derivatives */
1137 struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
1138 src.negate, src.abs,
1139 BRW_REGISTER_TYPE_F,
1140 BRW_VERTICAL_STRIDE_4,
1141 BRW_WIDTH_4,
1142 BRW_HORIZONTAL_STRIDE_1,
1143 BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
1144 struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
1145 src.negate, src.abs,
1146 BRW_REGISTER_TYPE_F,
1147 BRW_VERTICAL_STRIDE_4,
1148 BRW_WIDTH_4,
1149 BRW_HORIZONTAL_STRIDE_1,
1150 BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
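      /* In ALIGN16 mode the XYXY swizzle reads the top two pixels of each
       * subspan and ZWZW the bottom two, so the ADD below forms the
       * vertical difference in every channel.
       */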
1151 brw_push_insn_state(p);
1152 brw_set_default_access_mode(p, BRW_ALIGN_16);
1153 brw_ADD(p, dst, negate(src0), src1);
1154 brw_pop_insn_state(p);
1155 } else {
1156 /* replicate the derivative at the top-left pixel to other pixels */
1157 struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
1158 src.negate, src.abs,
1159 BRW_REGISTER_TYPE_F,
1160 BRW_VERTICAL_STRIDE_4,
1161 BRW_WIDTH_4,
1162 BRW_HORIZONTAL_STRIDE_0,
1163 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1164 struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
1165 src.negate, src.abs,
1166 BRW_REGISTER_TYPE_F,
1167 BRW_VERTICAL_STRIDE_4,
1168 BRW_WIDTH_4,
1169 BRW_HORIZONTAL_STRIDE_0,
1170 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1171 brw_ADD(p, dst, negate(src0), src1);
1172 }
1173 }
1174
1175 void
1176 fs_generator::generate_discard_jump(fs_inst *inst)
1177 {
1178 assert(devinfo->gen >= 6);
1179
1180 /* This HALT will be patched up at FB write time to point UIP at the end of
1181 * the program, and at brw_uip_jip() JIP will be set to the end of the
1182 * current block (or the program).
1183 */
1184 this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
1185 gen6_HALT(p);
1186 }
1187
1188 void
1189 fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
1190 {
1191 /* The 32-wide messages only respect the first 16-wide half of the channel
1192  * enable signals, which are replicated identically for the second group of
1193 * 16 channels, so we cannot use them unless the write is marked
1194 * force_writemask_all.
1195 */
1196 const unsigned lower_size = inst->force_writemask_all ? inst->exec_size :
1197 MIN2(16, inst->exec_size);
1198 const unsigned block_size = 4 * lower_size / REG_SIZE;
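   /* E.g. 16 channels * 4 bytes = 64 bytes, i.e. two 32-byte registers
    * per scratch message.
    */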
1199 assert(inst->mlen != 0);
1200
1201 brw_push_insn_state(p);
1202 brw_set_default_exec_size(p, cvt(lower_size) - 1);
1203 brw_set_default_compression(p, lower_size > 8);
1204
1205 for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
1206 brw_set_default_group(p, inst->group + lower_size * i);
1207
1208 brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0),
1209 retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD));
1210
1211 brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
1212 block_size,
1213 inst->offset + block_size * REG_SIZE * i);
1214 }
1215
1216 brw_pop_insn_state(p);
1217 }
1218
1219 void
1220 fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
1221 {
1222 assert(inst->exec_size <= 16 || inst->force_writemask_all);
1223 assert(inst->mlen != 0);
1224
1225 brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
1226 inst->exec_size / 8, inst->offset);
1227 }
1228
1229 void
1230 fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
1231 {
1232 assert(inst->exec_size <= 16 || inst->force_writemask_all);
1233
1234 gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
1235 }
1236
1237 void
1238 fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
1239 struct brw_reg dst,
1240 struct brw_reg index,
1241 struct brw_reg offset)
1242 {
1243 assert(type_sz(dst.type) == 4);
1244 assert(inst->mlen != 0);
1245
1246 assert(index.file == BRW_IMMEDIATE_VALUE &&
1247 index.type == BRW_REGISTER_TYPE_UD);
1248 uint32_t surf_index = index.ud;
1249
1250 assert(offset.file == BRW_IMMEDIATE_VALUE &&
1251 offset.type == BRW_REGISTER_TYPE_UD);
1252 uint32_t read_offset = offset.ud;
1253
1254 brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
1255 read_offset, surf_index);
1256 }
1257
1258 void
1259 fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
1260 struct brw_reg dst,
1261 struct brw_reg index,
1262 struct brw_reg payload)
1263 {
1264 assert(index.type == BRW_REGISTER_TYPE_UD);
1265 assert(payload.file == BRW_GENERAL_REGISTER_FILE);
1266 assert(type_sz(dst.type) == 4);
1267
1268 if (index.file == BRW_IMMEDIATE_VALUE) {
1269 const uint32_t surf_index = index.ud;
1270
1271 brw_push_insn_state(p);
1272 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1273 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1274 brw_pop_insn_state(p);
1275
1276 brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
1277 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
1278 brw_set_dp_read_message(p, send, surf_index,
1279 BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
1280 GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
1281 GEN6_SFID_DATAPORT_CONSTANT_CACHE,
1282 1, /* mlen */
1283 true, /* header */
1284 DIV_ROUND_UP(inst->size_written, REG_SIZE));
1285
1286 } else {
1287 struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
1288
1289 brw_push_insn_state(p);
1290 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1291
1292 /* a0.0 = surf_index & 0xff */
1293 brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
1294 brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
1295 brw_set_dest(p, insn_and, addr);
1296 brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
1297 brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
1298
1299 /* dst = send(payload, a0.0 | <descriptor>) */
1300 brw_inst *insn = brw_send_indirect_message(
1301 p, GEN6_SFID_DATAPORT_CONSTANT_CACHE,
1302 retype(dst, BRW_REGISTER_TYPE_UD),
1303 retype(payload, BRW_REGISTER_TYPE_UD), addr);
1304 brw_set_dp_read_message(p, insn, 0 /* surface */,
1305 BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
1306 GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
1307 GEN6_SFID_DATAPORT_CONSTANT_CACHE,
1308 1, /* mlen */
1309 true, /* header */
1310 DIV_ROUND_UP(inst->size_written, REG_SIZE));
1311
1312 brw_pop_insn_state(p);
1313 }
1314 }
1315
1316 void
1317 fs_generator::generate_varying_pull_constant_load_gen4(fs_inst *inst,
1318 struct brw_reg dst,
1319 struct brw_reg index)
1320 {
1321 assert(devinfo->gen < 7); /* Should use the gen7 variant. */
1322 assert(inst->header_size != 0);
1323 assert(inst->mlen);
1324
1325 assert(index.file == BRW_IMMEDIATE_VALUE &&
1326 index.type == BRW_REGISTER_TYPE_UD);
1327 uint32_t surf_index = index.ud;
1328
1329 uint32_t simd_mode, rlen, msg_type;
1330 if (inst->exec_size == 16) {
1331 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1332 rlen = 8;
1333 } else {
1334 assert(inst->exec_size == 8);
1335 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1336 rlen = 4;
1337 }
1338
1339 if (devinfo->gen >= 5)
1340 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
1341 else {
1342 /* We always use the SIMD16 message so that we only have to load U, and
1343 * not V or R.
1344 */
1345 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
1346 assert(inst->mlen == 3);
1347 assert(inst->size_written == 8 * REG_SIZE);
1348 rlen = 8;
1349 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1350 }
1351
1352 struct brw_reg header = brw_vec8_grf(0, 0);
1353 gen6_resolve_implied_move(p, &header, inst->base_mrf);
1354
1355 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1356 brw_inst_set_compression(devinfo, send, false);
1357 brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
1358 brw_set_src0(p, send, header);
1359 if (devinfo->gen < 6)
1360 brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf);
1361
1362 /* Our surface is set up as floats, regardless of what actual data is
1363 * stored in it.
1364 */
1365 uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
1366 brw_set_sampler_message(p, send,
1367 surf_index,
1368 0, /* sampler (unused) */
1369 msg_type,
1370 rlen,
1371 inst->mlen,
1372 inst->header_size != 0,
1373 simd_mode,
1374 return_format);
1375 }
1376
1377 void
1378 fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
1379 struct brw_reg dst,
1380 struct brw_reg index,
1381 struct brw_reg offset)
1382 {
1383 assert(devinfo->gen >= 7);
1384 /* Varying-offset pull constant loads are treated as a normal expression on
1385 * gen7, so the fact that it's a send message is hidden at the IR level.
1386 */
1387 assert(inst->header_size == 0);
1388 assert(!inst->mlen);
1389 assert(index.type == BRW_REGISTER_TYPE_UD);
1390
1391 uint32_t simd_mode, rlen, mlen;
1392 if (inst->exec_size == 16) {
1393 mlen = 2;
1394 rlen = 8;
1395 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1396 } else {
1397 assert(inst->exec_size == 8);
1398 mlen = 1;
1399 rlen = 4;
1400 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1401 }
1402
1403 if (index.file == BRW_IMMEDIATE_VALUE) {
1404
1405 uint32_t surf_index = index.ud;
1406
1407 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1408 brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
1409 brw_set_src0(p, send, offset);
1410 brw_set_sampler_message(p, send,
1411 surf_index,
1412 0, /* LD message ignores sampler unit */
1413 GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
1414 rlen,
1415 mlen,
1416 false, /* no header */
1417 simd_mode,
1418 0);
1419
1420 } else {
1421
1422 struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
1423
1424 brw_push_insn_state(p);
1425 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1426
1427 /* a0.0 = surf_index & 0xff */
1428 brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
1429 brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
1430 brw_set_dest(p, insn_and, addr);
1431 brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
1432 brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
1433
1434 brw_pop_insn_state(p);
1435
1436 /* dst = send(offset, a0.0 | <descriptor>) */
1437 brw_inst *insn = brw_send_indirect_message(
1438 p, BRW_SFID_SAMPLER, retype(dst, BRW_REGISTER_TYPE_UW),
1439 offset, addr);
1440 brw_set_sampler_message(p, insn,
1441 0 /* surface */,
1442 0 /* sampler */,
1443 GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
1444 rlen /* rlen */,
1445 mlen /* mlen */,
1446 false /* header */,
1447 simd_mode,
1448 0);
1449 }
1450 }
1451
1452 /**
1453 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
1454 * into the flags register (f0.0).
1455 *
1456 * Used only on Gen6 and above.
1457 */
1458 void
1459 fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
1460 {
1461 struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
1462 struct brw_reg dispatch_mask;
1463
1464 if (devinfo->gen >= 6)
1465 dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
1466 else
1467 dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1468
1469 brw_push_insn_state(p);
1470 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1471 brw_set_default_exec_size(p, BRW_EXECUTE_1);
1472 brw_MOV(p, flags, dispatch_mask);
1473 brw_pop_insn_state(p);
1474 }
1475
1476 void
1477 fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
1478 struct brw_reg dst,
1479 struct brw_reg src,
1480 struct brw_reg msg_data,
1481 unsigned msg_type)
1482 {
1483 assert(inst->size_written % REG_SIZE == 0);
1484 assert(msg_data.type == BRW_REGISTER_TYPE_UD);
1485
1486 brw_pixel_interpolator_query(p,
1487 retype(dst, BRW_REGISTER_TYPE_UW),
1488 src,
1489 inst->pi_noperspective,
1490 msg_type,
1491 msg_data,
1492 inst->mlen,
1493 inst->size_written / REG_SIZE);
1494 }
1495
1496 /* Sets vstride=1, width=4, hstride=0 of register src1 during
1497 * the ADD instruction.
1498 */
1499 void
1500 fs_generator::generate_set_sample_id(fs_inst *inst,
1501 struct brw_reg dst,
1502 struct brw_reg src0,
1503 struct brw_reg src1)
1504 {
1505 assert(dst.type == BRW_REGISTER_TYPE_D ||
1506 dst.type == BRW_REGISTER_TYPE_UD);
1507 assert(src0.type == BRW_REGISTER_TYPE_D ||
1508 src0.type == BRW_REGISTER_TYPE_UD);
1509
1510 struct brw_reg reg = stride(src1, 1, 4, 0);
1511 if (devinfo->gen >= 8 || inst->exec_size == 8) {
1512 brw_ADD(p, dst, src0, reg);
1513 } else if (inst->exec_size == 16) {
1514 brw_push_insn_state(p);
1515 brw_set_default_exec_size(p, BRW_EXECUTE_8);
1516 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1517 brw_ADD(p, firsthalf(dst), firsthalf(src0), reg);
1518 brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1519 brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(reg, 2));
1520 brw_pop_insn_state(p);
1521 }
1522 }
1523
1524 void
1525 fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
1526 struct brw_reg dst,
1527 struct brw_reg x,
1528 struct brw_reg y)
1529 {
1530 assert(devinfo->gen >= 7);
1531 assert(dst.type == BRW_REGISTER_TYPE_UD);
1532 assert(x.type == BRW_REGISTER_TYPE_F);
1533 assert(y.type == BRW_REGISTER_TYPE_F);
1534
1535 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
1536 *
1537 * Because this instruction does not have a 16-bit floating-point type,
1538 * the destination data type must be Word (W).
1539 *
1540 * The destination must be DWord-aligned and specify a horizontal stride
1541 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
1542 * each destination channel and the upper word is not modified.
1543 */
1544 struct brw_reg dst_w = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
1545
1546 /* Give each 32-bit channel of dst the form below, where "." means
1547 * unchanged.
1548 * 0x....hhhh
1549 */
1550 brw_F32TO16(p, dst_w, y);
1551
1552 /* Now the form:
1553 * 0xhhhh0000
1554 */
1555 brw_SHL(p, dst, dst, brw_imm_ud(16u));
1556
1557    /* And, finally, the form of packHalf2x16's output:
1558 * 0xhhhhllll
1559 */
1560 brw_F32TO16(p, dst_w, x);
1561 }
1562
1563 void
1564 fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
1565 struct brw_reg dst,
1566 struct brw_reg src)
1567 {
1568 assert(devinfo->gen >= 7);
1569 assert(dst.type == BRW_REGISTER_TYPE_F);
1570 assert(src.type == BRW_REGISTER_TYPE_UD);
1571
1572 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1573 *
1574 * Because this instruction does not have a 16-bit floating-point type,
1575 * the source data type must be Word (W). The destination type must be
1576 * F (Float).
1577 */
1578 struct brw_reg src_w = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1579
1580 /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
1581 * For the Y case, we wish to access only the upper word; therefore
1582 * a 16-bit subregister offset is needed.
1583 */
1584 assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
1585 inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
1586 if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
1587 src_w.subnr += 2;
1588
1589 brw_F16TO32(p, dst, src_w);
1590 }
1591
1592 void
1593 fs_generator::generate_shader_time_add(fs_inst *inst,
1594 struct brw_reg payload,
1595 struct brw_reg offset,
1596 struct brw_reg value)
1597 {
1598 assert(devinfo->gen >= 7);
1599 brw_push_insn_state(p);
1600 brw_set_default_mask_control(p, true);
1601
1602 assert(payload.file == BRW_GENERAL_REGISTER_FILE);
1603 struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
1604 offset.type);
1605 struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
1606 value.type);
1607
1608 assert(offset.file == BRW_IMMEDIATE_VALUE);
1609 if (value.file == BRW_GENERAL_REGISTER_FILE) {
1610 value.width = BRW_WIDTH_1;
1611 value.hstride = BRW_HORIZONTAL_STRIDE_0;
1612 value.vstride = BRW_VERTICAL_STRIDE_0;
1613 } else {
1614 assert(value.file == BRW_IMMEDIATE_VALUE);
1615 }
1616
1617 /* Trying to deal with setup of the params from the IR is crazy in the FS8
1618 * case, and we don't really care about squeezing every bit of performance
1619 * out of this path, so we just emit the MOVs from here.
1620 */
1621 brw_MOV(p, payload_offset, offset);
1622 brw_MOV(p, payload_value, value);
1623 brw_shader_time_add(p, payload,
1624 prog_data->binding_table.shader_time_start);
1625 brw_pop_insn_state(p);
1626
1627 brw_mark_surface_used(prog_data,
1628 prog_data->binding_table.shader_time_start);
1629 }
1630
1631 void
1632 fs_generator::enable_debug(const char *shader_name)
1633 {
1634 debug_flag = true;
1635 this->shader_name = shader_name;
1636 }
1637
1638 int
1639 fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
1640 {
1641    /* Align to a 64-byte boundary. */
1642 while (p->next_insn_offset % 64)
1643 brw_NOP(p);
1644
1645 this->dispatch_width = dispatch_width;
1646
1647 int start_offset = p->next_insn_offset;
1648 int spill_count = 0, fill_count = 0;
1649 int loop_count = 0;
1650
1651 struct annotation_info annotation;
1652 memset(&annotation, 0, sizeof(annotation));
1653
1654 foreach_block_and_inst (block, fs_inst, inst, cfg) {
1655 struct brw_reg src[3], dst;
1656 unsigned int last_insn_offset = p->next_insn_offset;
1657 bool multiple_instructions_emitted = false;
1658
1659 /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the
1660        *  "Register Region Restrictions" section, for BDW and SKL:
1661 *
1662 * "A POW/FDIV operation must not be followed by an instruction
1663 * that requires two destination registers."
1664 *
1665 * The documentation is often lacking annotations for Atom parts,
1666 * and empirically this affects CHV as well.
1667 */
      if (devinfo->gen >= 8 &&
          devinfo->gen <= 9 &&
          p->nr_insn > 1 &&
          brw_inst_opcode(devinfo, brw_last_inst) == BRW_OPCODE_MATH &&
          brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW &&
          inst->dst.component_size(inst->exec_size) > REG_SIZE) {
         brw_NOP(p);
         last_insn_offset = p->next_insn_offset;
      }

      if (unlikely(debug_flag))
         annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);

      /* If the instruction writes to more than one register, it needs to be
       * explicitly marked as compressed on Gen <= 5.  On Gen >= 6 the
       * hardware figures out by itself what the right compression mode is,
       * but we still need to know whether the instruction is compressed to
       * set up the source register regions appropriately.
       *
       * XXX - This is wrong for instructions that write a single register but
       *       read more than one which should strictly speaking be treated as
       *       compressed.  For instructions that don't write any registers it
       *       relies on the destination being a null register of the correct
       *       type and regioning so the instruction is considered compressed
       *       or not accordingly.
       */
      const bool compressed =
         inst->dst.component_size(inst->exec_size) > REG_SIZE;
      brw_set_default_compression(p, compressed);
      brw_set_default_group(p, inst->group);

      for (unsigned int i = 0; i < inst->sources; i++) {
         src[i] = brw_reg_from_fs_reg(devinfo, inst,
                                      &inst->src[i], compressed);
         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
          */
         assert(!inst->conditional_mod ||
                inst->src[i].type != BRW_REGISTER_TYPE_UD ||
                !inst->src[i].negate);
      }
      dst = brw_reg_from_fs_reg(devinfo, inst,
                                &inst->dst, compressed);

      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_predicate_control(p, inst->predicate);
      brw_set_default_predicate_inverse(p, inst->predicate_inverse);
      brw_set_default_flag_reg(p, 0, inst->flag_subreg);
      brw_set_default_saturate(p, inst->saturate);
      brw_set_default_mask_control(p, inst->force_writemask_all);
      brw_set_default_acc_write_control(p, inst->writes_accumulator);

      unsigned exec_size = inst->exec_size;
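      /* IVB and BYT appear to count the channels of instructions with a
       * 64-bit type in terms of 32-bit components, so the execution size of
       * the EU instruction has to be twice the logical execution size for
       * DF operations on these platforms.
       */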
      if (devinfo->gen == 7 && !devinfo->is_haswell &&
          (get_exec_type_size(inst) == 8 || type_sz(inst->dst.type) == 8)) {
         exec_size *= 2;
      }

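      /* cvt() maps a power-of-two execution size n to log2(n) + 1, so
       * subtracting one yields the log2-based encoding expected by the
       * instruction's ExecSize field (BRW_EXECUTE_1 .. BRW_EXECUTE_32).
       */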
      brw_set_default_exec_size(p, cvt(exec_size) - 1);

      assert(inst->force_writemask_all || inst->exec_size >= 4);
      assert(inst->force_writemask_all || inst->group % inst->exec_size == 0);
      assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
      assert(inst->mlen <= BRW_MAX_MSG_LENGTH);

      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         brw_MOV(p, dst, src[0]);
         break;
      case BRW_OPCODE_ADD:
         brw_ADD(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MUL:
         brw_MUL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_AVG:
         brw_AVG(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MACH:
         brw_MACH(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_LINE:
         brw_LINE(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_MAD:
         assert(devinfo->gen >= 6);
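         /* Three-source instructions are only available in align16 mode
          * before Gen10, which introduced align1 three-source support.
          */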
         if (devinfo->gen < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_MAD(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_LRP:
         assert(devinfo->gen >= 6);
         if (devinfo->gen < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_LRP(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_FRC:
         brw_FRC(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDD:
         brw_RNDD(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDE:
         brw_RNDE(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDZ:
         brw_RNDZ(p, dst, src[0]);
         break;

      case BRW_OPCODE_AND:
         brw_AND(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_OR:
         brw_OR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_XOR:
         brw_XOR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_NOT:
         brw_NOT(p, dst, src[0]);
         break;
      case BRW_OPCODE_ASR:
         brw_ASR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHR:
         brw_SHR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHL:
         brw_SHL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_F32TO16:
         assert(devinfo->gen >= 7);
         brw_F32TO16(p, dst, src[0]);
         break;
      case BRW_OPCODE_F16TO32:
         assert(devinfo->gen >= 7);
         brw_F16TO32(p, dst, src[0]);
         break;
      case BRW_OPCODE_CMP:
         if (inst->exec_size >= 16 && devinfo->gen == 7 && !devinfo->is_haswell &&
             dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround
             * implemented in the compiler is not sufficient. Overriding the
             * type when the destination is the null register is necessary but
             * not sufficient by itself.
             */
            assert(dst.nr == BRW_ARF_NULL);
            dst.type = BRW_REGISTER_TYPE_D;
         }
         brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
         break;
      case BRW_OPCODE_SEL:
         brw_SEL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_BFREV:
         assert(devinfo->gen >= 7);
         brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
                   retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_FBH:
         assert(devinfo->gen >= 7);
         brw_FBH(p, retype(dst, src[0].type), src[0]);
         break;
      case BRW_OPCODE_FBL:
         assert(devinfo->gen >= 7);
         brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD),
                 retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_LZD:
         brw_LZD(p, dst, src[0]);
         break;
      case BRW_OPCODE_CBIT:
         assert(devinfo->gen >= 7);
         brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD),
                  retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_ADDC:
         assert(devinfo->gen >= 7);
         brw_ADDC(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SUBB:
         assert(devinfo->gen >= 7);
         brw_SUBB(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MAC:
         brw_MAC(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_BFE:
         assert(devinfo->gen >= 7);
         if (devinfo->gen < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_BFE(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_BFI1:
         assert(devinfo->gen >= 7);
         brw_BFI1(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_BFI2:
         assert(devinfo->gen >= 7);
         if (devinfo->gen < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_BFI2(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_IF:
         if (inst->src[0].file != BAD_FILE) {
            /* The instruction has an embedded compare (only allowed on gen6) */
            assert(devinfo->gen == 6);
            gen6_IF(p, inst->conditional_mod, src[0], src[1]);
         } else {
            brw_IF(p, brw_inst_exec_size(devinfo, p->current));
         }
         break;

      case BRW_OPCODE_ELSE:
         brw_ELSE(p);
         break;
      case BRW_OPCODE_ENDIF:
         brw_ENDIF(p);
         break;

      case BRW_OPCODE_DO:
         brw_DO(p, brw_inst_exec_size(devinfo, p->current));
         break;

      case BRW_OPCODE_BREAK:
         brw_BREAK(p);
         break;
      case BRW_OPCODE_CONTINUE:
         brw_CONT(p);
         break;

      case BRW_OPCODE_WHILE:
         brw_WHILE(p);
         loop_count++;
         break;

      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
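         /* On Gen6+ math is an ordinary ALU instruction with no message
          * payload, while on Gen4/5 it is a send to the shared math unit
          * with its argument passed through an MRF.
          */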
         if (devinfo->gen >= 6) {
            assert(inst->mlen == 0);
            assert(devinfo->gen >= 7 || inst->exec_size == 8);
            gen6_math(p, dst, brw_math_function(inst->opcode),
                      src[0], brw_null_reg());
         } else {
            assert(inst->mlen >= 1);
            assert(devinfo->gen == 5 || devinfo->is_g4x || inst->exec_size == 8);
            gen4_math(p, dst,
                      brw_math_function(inst->opcode),
                      inst->base_mrf, src[0],
                      BRW_MATH_PRECISION_FULL);
         }
         break;
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
      case SHADER_OPCODE_POW:
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
         if (devinfo->gen >= 6) {
            assert(inst->mlen == 0);
            assert((devinfo->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) ||
                   inst->exec_size == 8);
            gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
         } else {
            assert(inst->mlen >= 1);
            assert(inst->exec_size == 8);
            gen4_math(p, dst, brw_math_function(inst->opcode),
                      inst->base_mrf, src[0],
                      BRW_MATH_PRECISION_FULL);
         }
         break;
      case FS_OPCODE_CINTERP:
         brw_MOV(p, dst, src[0]);
         break;
      case FS_OPCODE_LINTERP:
         generate_linterp(inst, dst, src);
         break;
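      /* The thread payload stores the coordinates of each 2x2 subspan as
       * four X values followed by four Y values, all unsigned words, so a
       * <8;4,1>:UW region starting at word 0 (for X) or word 4 (for Y)
       * gathers the coordinates of the two subspans processed by each
       * SIMD8 chunk.
       */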
      case FS_OPCODE_PIXEL_X:
         assert(src[0].type == BRW_REGISTER_TYPE_UW);
         src[0].subnr = 0 * type_sz(src[0].type);
         brw_MOV(p, dst, stride(src[0], 8, 4, 1));
         break;
      case FS_OPCODE_PIXEL_Y:
         assert(src[0].type == BRW_REGISTER_TYPE_UW);
         src[0].subnr = 4 * type_sz(src[0].type);
         brw_MOV(p, dst, stride(src[0], 8, 4, 1));
         break;
      case FS_OPCODE_GET_BUFFER_SIZE:
         generate_get_buffer_size(inst, dst, src[0], src[1]);
         break;
      case SHADER_OPCODE_TEX:
      case FS_OPCODE_TXB:
      case SHADER_OPCODE_TXD:
      case SHADER_OPCODE_TXF:
      case SHADER_OPCODE_TXF_LZ:
      case SHADER_OPCODE_TXF_CMS:
      case SHADER_OPCODE_TXF_CMS_W:
      case SHADER_OPCODE_TXF_UMS:
      case SHADER_OPCODE_TXF_MCS:
      case SHADER_OPCODE_TXL:
      case SHADER_OPCODE_TXL_LZ:
      case SHADER_OPCODE_TXS:
      case SHADER_OPCODE_LOD:
      case SHADER_OPCODE_TG4:
      case SHADER_OPCODE_TG4_OFFSET:
      case SHADER_OPCODE_SAMPLEINFO:
         generate_tex(inst, dst, src[0], src[1], src[2]);
         break;
      case FS_OPCODE_DDX_COARSE:
      case FS_OPCODE_DDX_FINE:
         generate_ddx(inst->opcode, dst, src[0]);
         break;
      case FS_OPCODE_DDY_COARSE:
      case FS_OPCODE_DDY_FINE:
         generate_ddy(inst->opcode, dst, src[0]);
         break;

      case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
         generate_scratch_write(inst, src[0]);
         spill_count++;
         break;

      case SHADER_OPCODE_GEN4_SCRATCH_READ:
         generate_scratch_read(inst, dst);
         fill_count++;
         break;

      case SHADER_OPCODE_GEN7_SCRATCH_READ:
         generate_scratch_read_gen7(inst, dst);
         fill_count++;
         break;

      case SHADER_OPCODE_MOV_INDIRECT:
         generate_mov_indirect(inst, dst, src[0], src[1]);
         break;

      case SHADER_OPCODE_URB_READ_SIMD8:
      case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
         generate_urb_read(inst, dst, src[0]);
         break;

      case SHADER_OPCODE_URB_WRITE_SIMD8:
      case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
         generate_urb_write(inst, src[0]);
         break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
         assert(inst->force_writemask_all);
         generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
         assert(inst->force_writemask_all);
         generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
         generate_varying_pull_constant_load_gen4(inst, dst, src[0]);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
         generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_REP_FB_WRITE:
      case FS_OPCODE_FB_WRITE:
         generate_fb_write(inst, src[0]);
         break;

      case FS_OPCODE_FB_READ:
         generate_fb_read(inst, dst, src[0]);
         break;

      case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
         generate_mov_dispatch_to_flags(inst);
         break;

      case FS_OPCODE_DISCARD_JUMP:
         generate_discard_jump(inst);
         break;

      case SHADER_OPCODE_SHADER_TIME_ADD:
         generate_shader_time_add(inst, src[0], src[1], src[2]);
         break;

      case SHADER_OPCODE_UNTYPED_ATOMIC:
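         /* A response from the data port is requested only when the
          * destination is non-null, i.e. when the result of the atomic is
          * actually used.
          */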
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud,
                            inst->mlen, !inst->dst.is_null());
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_READ:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_surface_read(p, dst, src[0], src[1],
                                  inst->mlen, src[2].ud);
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_surface_write(p, src[0], src[1],
                                   inst->mlen, src[2].ud);
         break;

      case SHADER_OPCODE_TYPED_ATOMIC:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_atomic(p, dst, src[0], src[1],
                          src[2].ud, inst->mlen, !inst->dst.is_null());
         break;

      case SHADER_OPCODE_TYPED_SURFACE_READ:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_surface_read(p, dst, src[0], src[1],
                                inst->mlen, src[2].ud);
         break;

      case SHADER_OPCODE_TYPED_SURFACE_WRITE:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_surface_write(p, src[0], src[1], inst->mlen, src[2].ud);
         break;

      case SHADER_OPCODE_MEMORY_FENCE:
         brw_memory_fence(p, dst);
         break;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
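         /* With packed dispatch every channel is known to be enabled, so an
          * all-ones mask can be used.  Otherwise find an enabled channel
          * using the hardware vector mask for fragment shaders and the
          * dispatch mask for other stages.
          */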
         const struct brw_reg mask =
            brw_stage_has_packed_dispatch(devinfo, stage,
                                          prog_data) ? brw_imm_ud(~0u) :
            stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
            brw_dmask_reg();
         brw_find_live_channel(p, dst, mask);
         break;
      }

      case SHADER_OPCODE_BROADCAST:
         assert(inst->force_writemask_all);
         brw_broadcast(p, dst, src[0], src[1]);
         break;

      case FS_OPCODE_SET_SAMPLE_ID:
         generate_set_sample_id(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
         generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
         generate_unpack_half_2x16_split(inst, dst, src[0]);
         break;

      case FS_OPCODE_PLACEHOLDER_HALT:
         /* This is the place where the final HALT needs to be inserted if
          * we've emitted any discards.  If not, this will emit no code.
          */
         if (!patch_discard_jumps_to_fb_writes()) {
            if (unlikely(debug_flag)) {
               annotation.ann_count--;
            }
         }
         break;

      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
         break;

      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
         break;

      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
         break;

      case CS_OPCODE_CS_TERMINATE:
         generate_cs_terminate(inst, src[0]);
         break;

      case SHADER_OPCODE_BARRIER:
         generate_barrier(inst, src[0]);
         break;

      case BRW_OPCODE_DIM:
         assert(devinfo->is_haswell);
         assert(src[0].type == BRW_REGISTER_TYPE_DF);
         assert(dst.type == BRW_REGISTER_TYPE_DF);
         brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
         break;

      default:
         unreachable("Unsupported opcode");

      case SHADER_OPCODE_LOAD_PAYLOAD:
         unreachable("Should be lowered by lower_load_payload()");
      }

      if (multiple_instructions_emitted)
         continue;

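      /* These controls are applied by patching the instruction after it has
       * been emitted, which only works if the IR instruction was emitted as
       * exactly one hardware instruction, as the assert below verifies.
       */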
      if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
         assert(p->next_insn_offset == last_insn_offset + 16 ||
                !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
                 "emitting more than 1 instruction");

         brw_inst *last = &p->store[last_insn_offset / 16];

         if (inst->conditional_mod)
            brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
         brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
         brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
      }
   }

   brw_set_uip_jip(p, start_offset);
   annotation_finalize(&annotation, p->next_insn_offset);

#ifndef NDEBUG
   bool validated = brw_validate_instructions(devinfo, p->store,
                                              start_offset,
                                              p->next_insn_offset,
                                              &annotation);
#else
   if (unlikely(debug_flag))
      brw_validate_instructions(devinfo, p->store,
                                start_offset,
                                p->next_insn_offset,
                                &annotation);
#endif

   int before_size = p->next_insn_offset - start_offset;
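   /* Instruction compaction rewrites eligible instructions into their
    * 8-byte compacted form in place, presumably updating jump targets and
    * the annotation offsets to match.
    */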
   brw_compact_instructions(p, start_offset, annotation.ann_count,
                            annotation.ann);
   int after_size = p->next_insn_offset - start_offset;

   if (unlikely(debug_flag)) {
      fprintf(stderr, "Native code for %s\n"
              "SIMD%d shader: %d instructions. %d loops. %u cycles. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d"
              " bytes (%.0f%%)\n",
              shader_name, dispatch_width, before_size / 16, loop_count, cfg->cycle_count,
              spill_count, fill_count, promoted_constants, before_size, after_size,
              100.0f * (before_size - after_size) / before_size);

      dump_assembly(p->store, annotation.ann_count, annotation.ann,
                    p->devinfo);
      ralloc_free(annotation.mem_ctx);
   }
   assert(validated);

   compiler->shader_debug_log(log_data,
                              "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
                              "%d:%d spills:fills, Promoted %u constants, "
                              "compacted %d to %d bytes.",
                              _mesa_shader_stage_to_abbrev(stage),
                              dispatch_width, before_size / 16,
                              loop_count, cfg->cycle_count, spill_count,
                              fill_count, promoted_constants, before_size,
                              after_size);

   return start_offset;
}

const unsigned *
fs_generator::get_assembly(unsigned int *assembly_size)
{
   return brw_get_program(p, assembly_size);
}