i965/fs: Use type-W for immediate in SampleID setup.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_generator.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs_generator.cpp
25 *
26 * This file supports generating code from the FS LIR to the actual
27 * native instructions.
28 */
29
30 #include "main/macros.h"
31 #include "brw_context.h"
32 #include "brw_eu.h"
33 #include "brw_fs.h"
34 #include "brw_cfg.h"
35
36 static uint32_t brw_file_from_reg(fs_reg *reg)
37 {
38 switch (reg->file) {
39 case GRF:
40 return BRW_GENERAL_REGISTER_FILE;
41 case MRF:
42 return BRW_MESSAGE_REGISTER_FILE;
43 case IMM:
44 return BRW_IMMEDIATE_VALUE;
45 default:
46 unreachable("not reached");
47 }
48 }
49
/* Convert an fs_reg from the FS IR into a brw_reg ready for instruction
 * encoding.  inst is consulted only for its exec_size (used to choose
 * region parameters); gen is the hardware generation, used to validate
 * MRF numbers.
 */
static struct brw_reg
brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen)
{
   struct brw_reg brw_reg;

   switch (reg->file) {
   case MRF:
      /* Mask off the compr4 bit (1 << 7) before range-checking the MRF
       * number.
       */
      assert((reg->reg & ~(1 << 7)) < BRW_MAX_MRF(gen));
      /* Fallthrough */
   case GRF:
      if (reg->stride == 0) {
         /* Stride 0 is a scalar value replicated to all channels: use a
          * <0,1,0> region.
          */
         brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, 0);
      } else if (inst->exec_size < 8) {
         brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
         brw_reg = stride(brw_reg, inst->exec_size * reg->stride,
                          inst->exec_size, reg->stride);
      } else {
         /* From the Haswell PRM:
          *
          * VertStride must be used to cross GRF register boundaries. This
          * rule implies that elements within a 'Width' cannot cross GRF
          * boundaries.
          *
          * So, for registers with width > 8, we have to use a width of 8
          * and trust the compression state to sort out the exec size.
          */
         brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
         brw_reg = stride(brw_reg, 8 * reg->stride, 8, reg->stride);
      }

      brw_reg = retype(brw_reg, reg->type);
      brw_reg = byte_offset(brw_reg, reg->subreg_offset);
      break;
   case IMM:
      /* Only the packed-vector immediate types (V/UV/VF) carry stride 1;
       * scalar immediates must have stride 0.
       */
      assert(reg->stride == ((reg->type == BRW_REGISTER_TYPE_V ||
                              reg->type == BRW_REGISTER_TYPE_UV ||
                              reg->type == BRW_REGISTER_TYPE_VF) ? 1 : 0));

      switch (reg->type) {
      case BRW_REGISTER_TYPE_F:
         brw_reg = brw_imm_f(reg->fixed_hw_reg.dw1.f);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_reg = brw_imm_d(reg->fixed_hw_reg.dw1.d);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_reg = brw_imm_ud(reg->fixed_hw_reg.dw1.ud);
         break;
      case BRW_REGISTER_TYPE_W:
         brw_reg = brw_imm_w(reg->fixed_hw_reg.dw1.d);
         break;
      case BRW_REGISTER_TYPE_UW:
         brw_reg = brw_imm_uw(reg->fixed_hw_reg.dw1.ud);
         break;
      case BRW_REGISTER_TYPE_VF:
         brw_reg = brw_imm_vf(reg->fixed_hw_reg.dw1.ud);
         break;
      default:
         /* NOTE(review): the assert above admits BRW_REGISTER_TYPE_UV and
          * BRW_REGISTER_TYPE_V, but there are no cases for them here —
          * presumably such immediates never reach this point; confirm.
          */
         unreachable("not reached");
      }
      break;
   case HW_REG:
      assert(reg->type == reg->fixed_hw_reg.type);
      brw_reg = reg->fixed_hw_reg;
      break;
   case BAD_FILE:
      /* Probably unused. */
      brw_reg = brw_null_reg();
      break;
   default:
      unreachable("not reached");
   }
   /* Source modifiers apply regardless of register file. */
   if (reg->abs)
      brw_reg = brw_abs(brw_reg);
   if (reg->negate)
      brw_reg = negate(brw_reg);

   return brw_reg;
}
129
/* Construct the FS code generator.  All codegen state (p) is allocated
 * out of mem_ctx; debug output is disabled by default (debug_flag).
 */
fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
                           void *mem_ctx,
                           const void *key,
                           struct brw_stage_prog_data *prog_data,
                           unsigned promoted_constants,
                           bool runtime_check_aads_emit,
                           const char *stage_abbrev)

   : compiler(compiler), log_data(log_data),
     devinfo(compiler->devinfo), key(key),
     prog_data(prog_data),
     promoted_constants(promoted_constants),
     runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
     stage_abbrev(stage_abbrev), mem_ctx(mem_ctx)
{
   p = rzalloc(mem_ctx, struct brw_codegen);
   brw_init_codegen(devinfo, p, mem_ctx);
}
148
fs_generator::~fs_generator()
{
   /* Nothing to release explicitly: everything is owned by mem_ctx. */
}
152
/* A single instruction-pointer bookmark, kept on an exec_list so HALT
 * instructions can be back-patched later (see
 * patch_discard_jumps_to_fb_writes()).
 */
class ip_record : public exec_node {
public:
   DECLARE_RALLOC_CXX_OPERATORS(ip_record)

   ip_record(int ip)
   {
      this->ip = ip;
   }

   /* Index into p->store of the instruction to patch. */
   int ip;
};
164
/* Emit a final HALT before the FB write and retarget the UIP of every
 * discard HALT recorded during codegen at it.  Returns true if a final
 * HALT was emitted (gen6+ with pending patches), false otherwise.
 */
bool
fs_generator::patch_discard_jumps_to_fb_writes()
{
   if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
      return false;

   int scale = brw_jump_scale(p->devinfo);

   /* There is a somewhat strange undocumented requirement of using
    * HALT, according to the simulator. If some channel has HALTed to
    * a particular UIP, then by the end of the program, every channel
    * must have HALTed to that UIP. Furthermore, the tracking is a
    * stack, so you can't do the final halt of a UIP after starting
    * halting to a new UIP.
    *
    * Symptoms of not emitting this instruction on actual hardware
    * included GPU hangs and sparkly rendering on the piglit discard
    * tests.
    */
   brw_inst *last_halt = gen6_HALT(p);
   brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
   brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);

   int ip = p->nr_insn;

   foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
      brw_inst *patch = &p->store[patch_ip->ip];

      assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
      /* HALT takes a half-instruction distance from the pre-incremented IP. */
      brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
   }

   this->discard_halt_patches.make_empty();
   return true;
}
201
/* Emit the actual framebuffer-write SEND.  payload points at the first
 * payload register (or MRF on older gens); nr is the message length in
 * registers.
 */
void
fs_generator::fire_fb_write(fs_inst *inst,
                            struct brw_reg payload,
                            struct brw_reg implied_header,
                            GLuint nr)
{
   uint32_t msg_control;

   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;

   if (devinfo->gen < 6) {
      /* Pre-gen6: copy g1 into the second payload register by hand. */
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0));
      brw_pop_insn_state(p);
   }

   /* Choose the render-target message control from replication, dual-source
    * blending, and SIMD width.
    */
   if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
   else if (prog_data->dual_src_blend) {
      if (!inst->force_sechalf)
         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
      else
         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
   } else if (inst->exec_size == 16)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   else
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;

   uint32_t surf_index =
      prog_data->binding_table.render_target_start + inst->target;

   bool last_render_target = inst->eot ||
                             (prog_data->dual_src_blend && dispatch_width == 16);


   brw_fb_WRITE(p,
                dispatch_width,
                payload,
                implied_header,
                msg_control,
                surf_index,
                nr,
                0,
                inst->eot,
                last_render_target,
                inst->header_size != 0);

   brw_mark_surface_used(&prog_data->base, surf_index);
}
255
/* Emit the framebuffer write for inst: builds the two-register message
 * header when present, then dispatches via fire_fb_write().  On pre-gen6
 * hardware with a runtime AA-data check, emits both message variants
 * guarded by a jump.
 */
void
fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
{
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key;
   struct brw_reg implied_header;

   if (devinfo->gen < 8 && !devinfo->is_haswell) {
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   }

   if (inst->base_mrf >= 0)
      payload = brw_message_reg(inst->base_mrf);

   /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
    * move, here's g1.
    */
   if (inst->header_size != 0) {
      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_flag_reg(p, 0, 0);

      /* On HSW, the GPU will use the predicate on SENDC, unless the header is
       * present.
       */
      if (prog_data->uses_kill) {
         struct brw_reg pixel_mask;

         if (devinfo->gen >= 6)
            pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
         else
            pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

         brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
      }

      if (devinfo->gen >= 6) {
         /* Copy g0 into the header, then patch in the per-RT fields below. */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_16);
         brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         brw_MOV(p,
                 retype(payload, BRW_REGISTER_TYPE_UD),
                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
         brw_pop_insn_state(p);

         if (inst->target > 0 && key->replicate_alpha) {
            /* Set "Source0 Alpha Present to RenderTarget" bit in message
             * header.
             */
            brw_OR(p,
                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                   brw_imm_ud(0x1 << 11));
         }

         if (inst->target > 0) {
            /* Set the render target index for choosing BLEND_STATE. */
            brw_MOV(p, retype(vec1(suboffset(payload, 2)),
                              BRW_REGISTER_TYPE_UD),
                    brw_imm_ud(inst->target));
         }

         /* Set computes stencil to render target */
         if (prog_data->computed_stencil) {
            brw_OR(p,
                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                   brw_imm_ud(0x1 << 14));
         }

         implied_header = brw_null_reg();
      } else {
         implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      }

      brw_pop_insn_state(p);
   } else {
      implied_header = brw_null_reg();
   }

   if (!runtime_check_aads_emit) {
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   } else {
      /* This can only happen in gen < 6 */
      assert(devinfo->gen < 6);

      struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));

      /* Check runtime bit to detect if we have to send AA data or not */
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_AND(p,
              v1_null_ud,
              retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(1<<26));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);

      int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
      brw_inst_set_exec_size(p->devinfo, brw_last_inst, BRW_EXECUTE_1);
      {
         /* Don't send AA data */
         fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
      }
      brw_land_fwd_jump(p, jmp);
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   }
}
364
/* Emit a SIMD8 URB read SEND: header supplies the URB handles, dst
 * receives inst->regs_written registers starting at constant global
 * offset inst->offset.
 */
void
fs_generator::generate_urb_read(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg header)
{
   assert(header.file == BRW_GENERAL_REGISTER_FILE);
   assert(header.type == BRW_REGISTER_TYPE_UD);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   brw_set_src1(p, send, brw_imm_ud(0u));

   brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB);
   brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ);

   brw_inst_set_mlen(p->devinfo, send, inst->mlen);
   brw_inst_set_rlen(p->devinfo, send, inst->regs_written);
   brw_inst_set_header_present(p->devinfo, send, true);
   brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset);
}
386
/* Emit a SIMD8 URB write SEND.  The per-slot-offset and channel-mask
 * descriptor bits are enabled for the corresponding *_PER_SLOT /
 * *_MASKED opcode variants.
 */
void
fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
{
   brw_inst *insn;

   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, brw_null_reg());
   brw_set_src0(p, insn, payload);
   brw_set_src1(p, insn, brw_imm_d(0));

   brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB);
   brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE);

   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
      brw_inst_set_urb_per_slot_offset(p->devinfo, insn, true);

   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
      brw_inst_set_urb_channel_mask_present(p->devinfo, insn, true);

   brw_inst_set_mlen(p->devinfo, insn, inst->mlen);
   brw_inst_set_rlen(p->devinfo, insn, 0);
   brw_inst_set_eot(p->devinfo, insn, inst->eot);
   brw_inst_set_header_present(p->devinfo, insn, true);
   brw_inst_set_urb_global_offset(p->devinfo, insn, inst->offset);
}
415
/* Terminate a compute shader thread: an EOT SEND to the thread spawner
 * with no response and no header.
 */
void
fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
{
   struct brw_inst *insn;

   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, brw_null_reg());
   brw_set_src0(p, insn, payload);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Terminate a compute shader by sending a message to the thread spawner.
    */
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER);
   brw_inst_set_mlen(devinfo, insn, 1);
   brw_inst_set_rlen(devinfo, insn, 0);
   brw_inst_set_eot(devinfo, insn, inst->eot);
   brw_inst_set_header_present(devinfo, insn, false);

   brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */
   brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */

   /* Note that even though the thread has a URB resource associated with it,
    * we set the "do not dereference URB" bit, because the URB resource is
    * managed by the fixed-function unit, so it will free it automatically.
    */
   brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */

   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}
446
/* Pack per-pixel stencil reference values (one byte per slot) into the
 * MDPR_STENCIL layout expected by the render-target write.  SIMD8 and
 * gen9+ only.
 */
void
fs_generator::generate_stencil_ref_packing(fs_inst *inst,
                                           struct brw_reg dst,
                                           struct brw_reg src)
{
   assert(dispatch_width == 8);
   assert(devinfo->gen >= 9);

   /* Stencil value updates are provided in 8 slots of 1 byte per slot.
    * Presumably, in order to save memory bandwidth, the stencil reference
    * values written from the FS need to be packed into 2 dwords (this makes
    * sense because the stencil values are limited to 1 byte each and a SIMD8
    * send, so stencil slots 0-3 in dw0, and 4-7 in dw1.)
    *
    * The spec is confusing here because in the payload definition of MDP_RTW_S8
    * (Message Data Payload for Render Target Writes with Stencil 8b) the
    * stencil value seems to be dw4.0-dw4.7. However, if you look at the type of
    * dw4 it is type MDPR_STENCIL (Message Data Payload Register) which is the
    * packed values specified above and diagrammed below:
    *
    *     31                          0
    *     --------------------------------
    *     DW  |                           |
    *     2-7 |            IGNORED        |
    *         |                           |
    *     --------------------------------
    *     DW1 | STC   | STC   | STC  | STC  |
    *         | slot7 | slot6 | slot5| slot4|
    *     --------------------------------
    *     DW0 | STC   | STC   | STC  | STC  |
    *         | slot3 | slot2 | slot1| slot0|
    *     --------------------------------
    */

   /* A <4,1,0> byte region steps 4 bytes per element, so the UB MOV below
    * gathers every fourth byte of src into consecutive bytes of dst.
    */
   src.vstride = BRW_VERTICAL_STRIDE_4;
   src.width = BRW_WIDTH_1;
   src.hstride = BRW_HORIZONTAL_STRIDE_0;
   assert(src.type == BRW_REGISTER_TYPE_UB);
   brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UB), src);
}
487
/* Emit a workgroup barrier message followed by a WAIT on the
 * notification register.
 */
void
fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src)
{
   brw_barrier(p, src);
   brw_WAIT(p);
}
494
/* Emit the framebuffer write used by BLORP programs: always SIMD16, with
 * the eot and last-render-target arguments hardwired to true (compare the
 * brw_fb_WRITE call in fire_fb_write()).
 */
void
fs_generator::generate_blorp_fb_write(fs_inst *inst)
{
   brw_fb_WRITE(p,
                16 /* dispatch_width */,
                brw_message_reg(inst->base_mrf),
                brw_reg_from_fs_reg(inst, &inst->src[0], devinfo->gen),
                BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE,
                inst->target,
                inst->mlen,
                0,
                true,
                true,
                inst->header_size != 0);
}
510
/* Emit attribute interpolation: a single PLN when the hardware has it and
 * the delta register is legally aligned, otherwise a LINE/MAC pair.
 */
void
fs_generator::generate_linterp(fs_inst *inst,
                               struct brw_reg dst, struct brw_reg *src)
{
   /* PLN reads:
    *           /   in SIMD16   \
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
    *    -----------------------------------
    *
    * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys:
    *
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|        |        | in SIMD8
    *   |-----------------------------------|
    *   |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
    *    -----------------------------------
    *
    * See also: emit_interpolation_setup_gen4().
    */
   struct brw_reg delta_x = src[0];
   struct brw_reg delta_y = offset(src[0], dispatch_width / 8);
   struct brw_reg interp = src[1];

   if (devinfo->has_pln &&
       (devinfo->gen >= 7 || (delta_x.nr & 1) == 0)) {
      /* Pre-gen7 PLN is only used when the delta register number is even. */
      brw_PLN(p, dst, interp, delta_x);
   } else {
      brw_LINE(p, brw_null_reg(), interp, delta_x);
      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
   }
}
547
/* Emit a gen6-style math instruction.  SIMD16 is split into two SIMD8
 * halves; for unary operations the second half's src1 is replaced with
 * the null register.
 */
void
fs_generator::generate_math_gen6(fs_inst *inst,
                                 struct brw_reg dst,
                                 struct brw_reg src0,
                                 struct brw_reg src1)
{
   int op = brw_math_function(inst->opcode);
   /* A src1 in the architecture register file (null) marks a unary op. */
   bool binop = src1.file != BRW_ARCHITECTURE_REGISTER_FILE;

   if (dispatch_width == 8) {
      gen6_math(p, dst, op, src0, src1);
   } else if (dispatch_width == 16) {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      gen6_math(p, firsthalf(dst), op, firsthalf(src0), firsthalf(src1));
      brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
      gen6_math(p, sechalf(dst), op, sechalf(src0),
                binop ? sechalf(src1) : brw_null_reg());
      brw_pop_insn_state(p);
   }
}
570
/* Emit a gen4 math send.  SIMD16 is split into two SIMD8 sends using
 * consecutive base MRFs, and the default compression state is restored
 * to compressed afterwards.
 */
void
fs_generator::generate_math_gen4(fs_inst *inst,
                                 struct brw_reg dst,
                                 struct brw_reg src)
{
   int op = brw_math_function(inst->opcode);

   assert(inst->mlen >= 1);

   if (dispatch_width == 8) {
      gen4_math(p, dst,
                op,
                inst->base_mrf, src,
                BRW_MATH_PRECISION_FULL);
   } else if (dispatch_width == 16) {
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      gen4_math(p, firsthalf(dst),
                op,
                inst->base_mrf, firsthalf(src),
                BRW_MATH_PRECISION_FULL);
      brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
      gen4_math(p, sechalf(dst),
                op,
                inst->base_mrf + 1, sechalf(src),
                BRW_MATH_PRECISION_FULL);

      /* Restore the default for subsequent SIMD16 instructions. */
      brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }
}
601
602 void
603 fs_generator::generate_math_g45(fs_inst *inst,
604 struct brw_reg dst,
605 struct brw_reg src)
606 {
607 if (inst->opcode == SHADER_OPCODE_POW ||
608 inst->opcode == SHADER_OPCODE_INT_QUOTIENT ||
609 inst->opcode == SHADER_OPCODE_INT_REMAINDER) {
610 generate_math_gen4(inst, dst, src);
611 return;
612 }
613
614 int op = brw_math_function(inst->opcode);
615
616 assert(inst->mlen >= 1);
617
618 gen4_math(p, dst,
619 op,
620 inst->base_mrf, src,
621 BRW_MATH_PRECISION_FULL);
622 }
623
/* Emit a resinfo sampler message to fetch the size of a buffer surface
 * into dst.  surf_index must be an immediate; gen7+ only.
 */
void
fs_generator::generate_get_buffer_size(fs_inst *inst,
                                       struct brw_reg dst,
                                       struct brw_reg src,
                                       struct brw_reg surf_index)
{
   assert(devinfo->gen >= 7);
   assert(surf_index.file == BRW_IMMEDIATE_VALUE);

   uint32_t simd_mode;
   int rlen = 4;

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      /* SIMD16 returns twice the data; use a vec16 destination. */
      rlen = 8;
      dst = vec16(dst);
   }

   brw_SAMPLE(p,
              retype(dst, BRW_REGISTER_TYPE_UW),
              inst->base_mrf,
              src,
              surf_index.dw1.ud,
              0,
              GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
              rlen, /* response length */
              inst->mlen,
              inst->header_size > 0,
              simd_mode,
              BRW_SAMPLER_RETURN_FORMAT_SINT32);

   brw_mark_surface_used(prog_data, surf_index.dw1.ud);
}
667
/* Emit a sampler message for a texturing opcode.  Picks the hardware
 * message type from the opcode and generation, sets up the message
 * header when present, and handles both immediate and indirect sampler
 * indices.
 */
void
fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
                           struct brw_reg sampler_index)
{
   int msg_type = -1;
   int rlen = 4;
   uint32_t simd_mode;
   uint32_t return_format;
   /* An EOT texture op is a combined sample + FB write (see the SENDC
    * conversion at the bottom).
    */
   bool is_combined_send = inst->eot;

   /* The sampler return format follows the destination type. */
   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (devinfo->gen >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
         }
         break;
      case SHADER_OPCODE_TXS:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+. Otherwise, lowered by brw_lower_texture_gradients(). */
            assert(devinfo->gen >= 8 || devinfo->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
         break;
      case SHADER_OPCODE_TXF:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_CMS:
         if (devinfo->gen >= 7)
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
         else
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_UMS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
         break;
      case SHADER_OPCODE_TXF_MCS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
         break;
      case SHADER_OPCODE_LOD:
         msg_type = GEN5_SAMPLER_MESSAGE_LOD;
         break;
      case SHADER_OPCODE_TG4:
         if (inst->shadow_compare) {
            assert(devinfo->gen >= 7);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
         } else {
            assert(devinfo->gen >= 6);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         }
         break;
      case SHADER_OPCODE_TG4_OFFSET:
         assert(devinfo->gen >= 7);
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
         }
         break;
      case SHADER_OPCODE_SAMPLEINFO:
         msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
         break;
      default:
         unreachable("not reached");
      }
   } else {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         /* Note that G45 and older determines shadow compare and dispatch width
          * from message length for most messages.
          */
         if (inst->exec_size == 8) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
            if (inst->shadow_compare) {
               assert(inst->mlen == 6);
            } else {
               assert(inst->mlen <= 4);
            }
         } else {
            if (inst->shadow_compare) {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
               assert(inst->mlen == 9);
            } else {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
               assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
            }
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually */
         assert(inst->exec_size == 8);
         assert(inst->mlen == 7 || inst->mlen == 10);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
         break;
      case SHADER_OPCODE_TXF:
         assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      case SHADER_OPCODE_TXS:
         assert(inst->mlen == 3);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      default:
         unreachable("not reached");
      }
   }
   assert(msg_type != -1);

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      rlen = 8;
      dst = vec16(dst);
   }

   if (is_combined_send) {
      assert(devinfo->gen >= 9 || devinfo->is_cherryview);
      /* The FB write consumes the data, so no response is needed. */
      rlen = 0;
   }

   assert(devinfo->gen < 7 || inst->header_size == 0 ||
          src.file == BRW_GENERAL_REGISTER_FILE);

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   /* Load the message header if present. If there's a texture offset,
    * we need to set it up explicitly and load the offset bitfield.
    * Otherwise, we can use an implied move from g0 to the first message reg.
    */
   if (inst->header_size != 0) {
      if (devinfo->gen < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      } else {
         struct brw_reg header_reg;

         if (devinfo->gen >= 7) {
            header_reg = src;
         } else {
            assert(inst->base_mrf != -1);
            header_reg = brw_message_reg(inst->base_mrf);
         }

         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_8);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_MOV(p, header_reg, brw_vec8_grf(0, 0));

         if (inst->offset) {
            /* Set the offset bits in DWord 2. */
            brw_MOV(p, get_element_ud(header_reg, 2),
                       brw_imm_ud(inst->offset));
         }

         brw_adjust_sampler_state_pointer(p, header_reg, sampler_index);
         brw_pop_insn_state(p);
      }
   }

   /* Gather ops address the separate gather binding-table section. */
   uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
          inst->opcode == SHADER_OPCODE_TG4_OFFSET)
         ? prog_data->binding_table.gather_texture_start
         : prog_data->binding_table.texture_start;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      uint32_t sampler = sampler_index.dw1.ud;

      brw_SAMPLE(p,
                 retype(dst, BRW_REGISTER_TYPE_UW),
                 inst->base_mrf,
                 src,
                 sampler + base_binding_table_index,
                 sampler % 16,
                 msg_type,
                 rlen,
                 inst->mlen,
                 inst->header_size != 0,
                 simd_mode,
                 return_format);

      brw_mark_surface_used(prog_data, sampler + base_binding_table_index);
   } else {
      /* Non-const sampler index */

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* addr = ((sampler * 0x101) + base_binding_table_index) & 0xfff */
      brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
      if (base_binding_table_index)
         brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
      brw_AND(p, addr, addr, brw_imm_ud(0xfff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, src, addr);
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              msg_type,
                              rlen,
                              inst->mlen /* mlen */,
                              inst->header_size != 0 /* header */,
                              simd_mode,
                              return_format);

      /* visitor knows more than we do about the surface limit required,
       * so has already done marking.
       */
   }

   if (is_combined_send) {
      brw_inst_set_eot(p->devinfo, brw_last_inst, true);
      brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC);
   }
}
960
961
962 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
963 * looking like:
964 *
965 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
966 *
967 * Ideally, we want to produce:
968 *
969 * DDX DDY
970 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
971 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
972 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
973 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
974 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
975 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
976 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
977 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
978 *
979 * and add another set of two more subspans if in 16-pixel dispatch mode.
980 *
981 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
982 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
983 * pair. But the ideal approximation may impose a huge performance cost on
984 * sample_d. On at least Haswell, sample_d instruction does some
985 * optimizations if the same LOD is used for all pixels in the subspan.
986 *
987 * For DDY, we need to use ALIGN16 mode since it's capable of doing the
988 * appropriate swizzling.
989 */
/* Compute screen-space X derivatives.  See the comment block above for
 * the subspan layout and region-parameter rationale.
 */
void
fs_generator::generate_ddx(enum opcode opcode,
                           struct brw_reg dst, struct brw_reg src)
{
   unsigned vstride, width;

   if (opcode == FS_OPCODE_DDX_FINE) {
      /* produce accurate derivatives */
      vstride = BRW_VERTICAL_STRIDE_2;
      width = BRW_WIDTH_2;
   } else {
      /* replicate the derivative at the top-left pixel to other pixels */
      vstride = BRW_VERTICAL_STRIDE_4;
      width = BRW_WIDTH_4;
   }

   /* src0 starts at subreg 1, src1 at subreg 0; dst = src0 - src1 via the
    * negated-source ADD below.
    */
   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
                                 src.negate, src.abs,
                                 BRW_REGISTER_TYPE_F,
                                 vstride,
                                 width,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
                                 src.negate, src.abs,
                                 BRW_REGISTER_TYPE_F,
                                 vstride,
                                 width,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   brw_ADD(p, dst, src0, negate(src1));
}
1022
1023 /* The negate_value boolean is used to negate the derivative computation for
1024 * FBOs, since they place the origin at the upper left instead of the lower
1025 * left.
1026 */
void
fs_generator::generate_ddy(enum opcode opcode,
                           struct brw_reg dst, struct brw_reg src,
                           bool negate_value)
{
   if (opcode == FS_OPCODE_DDY_FINE) {
      /* From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
       * Region Restrictions):
       *
       *     In Align16 access mode, SIMD16 is not allowed for DW operations
       *     and SIMD8 is not allowed for DF operations.
       *
       * In this context, "DW operations" means "operations acting on 32-bit
       * values", so it includes operations on floats.
       *
       * Gen4 has a similar restriction.  From the i965 PRM, section 11.5.3
       * (Instruction Compression -> Rules and Restrictions):
       *
       *     A compressed instruction must be in Align1 access mode. Align16
       *     mode instructions cannot be compressed.
       *
       * Similar text exists in the g45 PRM.
       *
       * On these platforms, if we're building a SIMD16 shader, we need to
       * manually unroll to a pair of SIMD8 instructions.
       */
      bool unroll_to_simd8 =
         (dispatch_width == 16 &&
          (devinfo->gen == 4 || (devinfo->gen == 7 && !devinfo->is_haswell)));

      /* produce accurate derivatives: use Align16 swizzles to pair up the
       * top and bottom rows of each subspan (xyxy selects the top pixels,
       * zwzw the bottom pixels) and subtract them.
       */
      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_1,
                                    BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
      struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_1,
                                    BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_16);
      if (unroll_to_simd8) {
         /* Emit two uncompressed SIMD8 halves; see PRM citations above. */
         brw_set_default_exec_size(p, BRW_EXECUTE_8);
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         if (negate_value) {
            brw_ADD(p, firsthalf(dst), firsthalf(src1), negate(firsthalf(src0)));
            brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_ADD(p, sechalf(dst), sechalf(src1), negate(sechalf(src0)));
         } else {
            brw_ADD(p, firsthalf(dst), firsthalf(src0), negate(firsthalf(src1)));
            brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_ADD(p, sechalf(dst), sechalf(src0), negate(sechalf(src1)));
         }
      } else {
         if (negate_value)
            brw_ADD(p, dst, src1, negate(src0));
         else
            brw_ADD(p, dst, src0, negate(src1));
      }
      brw_pop_insn_state(p);
   } else {
      /* replicate the derivative at the top-left pixel to other pixels:
       * src1 starts at subregister 2 (the bottom row of the subspan) and
       * src0 at subregister 0 (the top row); hstride 0 broadcasts within
       * each subspan.
       */
      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_0,
                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
      struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_0,
                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
      if (negate_value)
         brw_ADD(p, dst, src1, negate(src0));
      else
         brw_ADD(p, dst, src0, negate(src1));
   }
}
1115
void
fs_generator::generate_discard_jump(fs_inst *inst)
{
   /* Emit a HALT that skips the rest of the shader for discarded pixels;
    * its jump targets are patched later (see comment below).
    */
   assert(devinfo->gen >= 6);

   /* This HALT will be patched up at FB write time to point UIP at the end of
    * the program, and at brw_uip_jip() JIP will be set to the end of the
    * current block (or the program).
    */
   this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));

   /* Emit the HALT with writemask/mask control disabled. */
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   gen6_HALT(p);
   brw_pop_insn_state(p);
}
1132
void
fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
{
   /* Spill src to the scratch buffer at inst->offset via an OWord block
    * write message.
    */
   assert(inst->mlen != 0);

   /* Copy the data into m(base_mrf + 1); the message header itself is built
    * by brw_oword_block_write_scratch() in m(base_mrf).
    */
   brw_MOV(p,
           brw_uvec_mrf(inst->exec_size, (inst->base_mrf + 1), 0),
           retype(src, BRW_REGISTER_TYPE_UD));
   /* exec_size / 8 is the number of registers of data being written. */
   brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
                                 inst->exec_size / 8, inst->offset);
}
1144
void
fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
{
   /* Fill dst from the scratch buffer at inst->offset via an OWord block
    * read message (pre-Gen7 path; see generate_scratch_read_gen7).
    */
   assert(inst->mlen != 0);

   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
                                inst->exec_size / 8, inst->offset);
}
1153
void
fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
{
   /* Gen7+ scratch fill: no MRF setup needed, the helper builds the
    * message itself.
    */
   gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
}
1159
void
fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
                                                  struct brw_reg dst,
                                                  struct brw_reg index,
                                                  struct brw_reg offset)
{
   /* Load a uniform from the pull-constant buffer with an OWord block read.
    * Both the surface index and the read offset must be immediates here;
    * the gen7 variant below handles the indirect cases.
    */
   assert(inst->mlen != 0);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.dw1.ud;

   assert(offset.file == BRW_IMMEDIATE_VALUE &&
          offset.type == BRW_REGISTER_TYPE_UD);
   uint32_t read_offset = offset.dw1.ud;

   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
                        read_offset, surf_index);

   /* Record the surface so state setup knows the binding table size. */
   brw_mark_surface_used(prog_data, surf_index);
}
1181
void
fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg offset)
{
   /* Gen7+ uniform pull-constant load: emit a SIMD4x2 sampler LD message.
    * `index` (the surface) may be an immediate or a dynamically computed
    * register; `offset` must already live in a GRF.
    */
   assert(index.type == BRW_REGISTER_TYPE_UD);

   assert(offset.file == BRW_GENERAL_REGISTER_FILE);
   /* Reference just the dword we need, to avoid angering validate_reg(). */
   offset = brw_vec1_grf(offset.nr, 0);

   /* We use the SIMD4x2 mode because we want to end up with 4 components in
    * the destination loaded consecutively from the same offset (which appears
    * in the first component, and the rest are ignored).
    */
   dst.width = BRW_WIDTH_4;

   struct brw_reg src = offset;
   bool header_present = false;

   if (devinfo->gen >= 9) {
      /* Skylake requires a message header in order to use SIMD4x2 mode. */
      src = retype(brw_vec4_grf(offset.nr, 0), BRW_REGISTER_TYPE_UD);
      header_present = true;

      /* Build the header: copy g0, then set the SIMD4x2 extension bit in
       * dword 2.  WE_all so inactive channels don't suppress the writes.
       */
      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_MOV(p, vec8(src), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      brw_MOV(p, get_element_ud(src, 2),
              brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));
      brw_pop_insn_state(p);
   }

   if (index.file == BRW_IMMEDIATE_VALUE) {
      /* Constant surface index: bake it directly into the send descriptor. */

      uint32_t surf_index = index.dw1.ud;

      brw_push_insn_state(p);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_pop_insn_state(p);

      brw_set_dest(p, send, dst);
      brw_set_src0(p, send, src);
      brw_set_sampler_message(p, send,
                              surf_index,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              1, /* rlen */
                              inst->mlen,
                              header_present,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              0);

      brw_mark_surface_used(prog_data, surf_index);

   } else {
      /* Dynamic surface index: move it into a0.0 and use an indirect send. */

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = surf_index & 0xff */
      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
      brw_set_dest(p, insn_and, addr);
      brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));

      /* dst = send(payload, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, src, addr);
      brw_set_sampler_message(p, insn,
                              0,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              1, /* rlen */
                              inst->mlen,
                              header_present,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              0);

      brw_pop_insn_state(p);

      /* visitor knows more than we do about the surface limit required,
       * so has already done marking.
       */

   }
}
1279
void
fs_generator::generate_varying_pull_constant_load(fs_inst *inst,
                                                  struct brw_reg dst,
                                                  struct brw_reg index,
                                                  struct brw_reg offset)
{
   /* Pre-Gen7 varying-offset pull-constant load: emitted as a sampler LD
    * message with an explicit header and a per-channel offset payload.
    */
   assert(devinfo->gen < 7); /* Should use the gen7 variant. */
   assert(inst->header_size != 0);
   assert(inst->mlen);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.dw1.ud;

   uint32_t simd_mode, rlen, msg_type;
   if (dispatch_width == 16) {
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      rlen = 8;
   } else {
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      rlen = 4;
   }

   if (devinfo->gen >= 5)
      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
   else {
      /* We always use the SIMD16 message so that we only have to load U, and
       * not V or R.
       */
      msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
      assert(inst->mlen == 3);
      assert(inst->regs_written == 8);
      rlen = 8;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   }

   /* Write the per-channel offsets into m(base_mrf + 1), after the header. */
   struct brw_reg offset_mrf = retype(brw_message_reg(inst->base_mrf + 1),
                                      BRW_REGISTER_TYPE_D);
   brw_MOV(p, offset_mrf, offset);

   struct brw_reg header = brw_vec8_grf(0, 0);
   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_qtr_control(p->devinfo, send, BRW_COMPRESSION_NONE);
   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, send, header);
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf);

   /* Our surface is set up as floats, regardless of what actual data is
    * stored in it.
    */
   uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
   brw_set_sampler_message(p, send,
                           surf_index,
                           0, /* sampler (unused) */
                           msg_type,
                           rlen,
                           inst->mlen,
                           inst->header_size != 0,
                           simd_mode,
                           return_format);

   /* Record the surface so state setup knows the binding table size. */
   brw_mark_surface_used(prog_data, surf_index);
}
1346
void
fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg offset)
{
   assert(devinfo->gen >= 7);
   /* Varying-offset pull constant loads are treated as a normal expression on
    * gen7, so the fact that it's a send message is hidden at the IR level.
    */
   assert(inst->header_size == 0);
   assert(!inst->mlen);
   assert(index.type == BRW_REGISTER_TYPE_UD);

   /* Message and return lengths scale with dispatch width (one or two
    * registers of offsets in, four or eight registers of data out).
    */
   uint32_t simd_mode, rlen, mlen;
   if (dispatch_width == 16) {
      mlen = 2;
      rlen = 8;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   } else {
      mlen = 1;
      rlen = 4;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
   }

   if (index.file == BRW_IMMEDIATE_VALUE) {
      /* Constant surface index: bake it directly into the send descriptor. */

      uint32_t surf_index = index.dw1.ud;

      brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
      brw_set_src0(p, send, offset);
      brw_set_sampler_message(p, send,
                              surf_index,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              rlen,
                              mlen,
                              false, /* no header */
                              simd_mode,
                              0);

      brw_mark_surface_used(prog_data, surf_index);

   } else {
      /* Dynamic surface index: move it into a0.0 and use an indirect send. */

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = surf_index & 0xff */
      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
      brw_set_dest(p, insn_and, addr);
      brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, retype(dst, BRW_REGISTER_TYPE_UW),
         offset, addr);
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              rlen /* rlen */,
                              mlen /* mlen */,
                              false /* header */,
                              simd_mode,
                              0);

      /* visitor knows more than we do about the surface limit required,
       * so has already done marking.
       */
   }
}
1427
/**
 * Cause the current pixel/sample dispatch mask to be transferred into the
 * flags register (f0.0).
 *
 * On Gen6 and above the mask is read from R1.7 (bits 15:0); on earlier
 * generations it is read from R0.0.
 */
void
fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
{
   struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
   struct brw_reg dispatch_mask;

   /* The dispatch mask's location depends on the generation: r1.7 on Gen6+,
    * r0.0 on earlier hardware.
    */
   if (devinfo->gen >= 6)
      dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
   else
      dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

   /* WE_all so the flag write itself isn't predicated/masked. */
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, flags, dispatch_mask);
   brw_pop_insn_state(p);
}
1450
1451 void
1452 fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
1453 struct brw_reg dst,
1454 struct brw_reg src,
1455 struct brw_reg msg_data,
1456 unsigned msg_type)
1457 {
1458 assert(msg_data.type == BRW_REGISTER_TYPE_UD);
1459
1460 brw_pixel_interpolator_query(p,
1461 retype(dst, BRW_REGISTER_TYPE_UW),
1462 src,
1463 inst->pi_noperspective,
1464 msg_type,
1465 msg_data,
1466 inst->mlen,
1467 inst->regs_written);
1468 }
1469
1470
1471 /**
1472 * Sets the first word of a vgrf for gen7+ simd4x2 uniform pull constant
1473 * sampler LD messages.
1474 *
1475 * We don't want to bake it into the send message's code generation because
1476 * that means we don't get a chance to schedule the instructions.
1477 */
void
fs_generator::generate_set_simd4x2_offset(fs_inst *inst,
                                          struct brw_reg dst,
                                          struct brw_reg value)
{
   /* Write the immediate `value` into the first element of dst, using a
    * scalar WE_all uncompressed MOV so only that one dword is touched.
    */
   assert(value.file == BRW_IMMEDIATE_VALUE);

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
   brw_pop_insn_state(p);
}
1492
1493 /* Sets vstride=1, width=4, hstride=0 of register src1 during
1494 * the ADD instruction.
1495 */
void
fs_generator::generate_set_sample_id(fs_inst *inst,
                                     struct brw_reg dst,
                                     struct brw_reg src0,
                                     struct brw_reg src1)
{
   /* dst = src0 + src1, where src1 is read with a <1;4,0> region so that
    * each subspan (group of 4 channels) sees one element of src1.
    */
   assert(dst.type == BRW_REGISTER_TYPE_D ||
          dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_D ||
          src0.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   /* vstride=1, width=4, hstride=0 (see comment above the function). */
   struct brw_reg reg = stride(src1, 1, 4, 0);
   if (dispatch_width == 8) {
      brw_ADD(p, dst, src0, reg);
   } else if (dispatch_width == 16) {
      /* Two SIMD8 halves; the second half's region starts two elements
       * further into src1.
       */
      brw_ADD(p, firsthalf(dst), firsthalf(src0), reg);
      brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(reg, 2));
   }
   brw_pop_insn_state(p);
}
1520
void
fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
                                            struct brw_reg dst,
                                            struct brw_reg x,
                                            struct brw_reg y)
{
   /* Implement GLSL packHalf2x16(vec2(x, y)): convert y and x to half floats
    * and pack them into the high and low words of each dst channel.  The
    * y / shift / x ordering below is load-bearing.
    */
   assert(devinfo->gen >= 7);
   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(x.type == BRW_REGISTER_TYPE_F);
   assert(y.type == BRW_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the destination data type must be Word (W).
    *
    *   The destination must be DWord-aligned and specify a horizontal stride
    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
    *   each destination channel and the upper word is not modified.
    */
   struct brw_reg dst_w = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);

   /* Give each 32-bit channel of dst the form below, where "." means
    * unchanged.
    *   0x....hhhh
    */
   brw_F32TO16(p, dst_w, y);

   /* Now the form:
    *   0xhhhh0000
    */
   brw_SHL(p, dst, dst, brw_imm_ud(16u));

   /* And, finally the form of packHalf2x16's output:
    *   0xhhhhllll
    */
   brw_F32TO16(p, dst_w, x);
}
1559
void
fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
                                              struct brw_reg dst,
                                              struct brw_reg src)
{
   /* Implement one component of GLSL unpackHalf2x16: convert either the low
    * (X) or high (Y) half-float word of each src channel to a full float.
    */
   assert(devinfo->gen >= 7);
   assert(dst.type == BRW_REGISTER_TYPE_F);
   assert(src.type == BRW_REGISTER_TYPE_UD);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the source data type must be Word (W). The destination type must be
    *   F (Float).
    */
   struct brw_reg src_w = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

   /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
    * For the Y case, we wish to access only the upper word; therefore
    * a 16-bit subregister offset is needed.
    */
   assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
          inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
   if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
      src_w.subnr += 2;

   brw_F16TO32(p, dst, src_w);
}
1588
1589 void
1590 fs_generator::generate_shader_time_add(fs_inst *inst,
1591 struct brw_reg payload,
1592 struct brw_reg offset,
1593 struct brw_reg value)
1594 {
1595 assert(devinfo->gen >= 7);
1596 brw_push_insn_state(p);
1597 brw_set_default_mask_control(p, true);
1598
1599 assert(payload.file == BRW_GENERAL_REGISTER_FILE);
1600 struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
1601 offset.type);
1602 struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
1603 value.type);
1604
1605 assert(offset.file == BRW_IMMEDIATE_VALUE);
1606 if (value.file == BRW_GENERAL_REGISTER_FILE) {
1607 value.width = BRW_WIDTH_1;
1608 value.hstride = BRW_HORIZONTAL_STRIDE_0;
1609 value.vstride = BRW_VERTICAL_STRIDE_0;
1610 } else {
1611 assert(value.file == BRW_IMMEDIATE_VALUE);
1612 }
1613
1614 /* Trying to deal with setup of the params from the IR is crazy in the FS8
1615 * case, and we don't really care about squeezing every bit of performance
1616 * out of this path, so we just emit the MOVs from here.
1617 */
1618 brw_MOV(p, payload_offset, offset);
1619 brw_MOV(p, payload_value, value);
1620 brw_shader_time_add(p, payload,
1621 prog_data->binding_table.shader_time_start);
1622 brw_pop_insn_state(p);
1623
1624 brw_mark_surface_used(prog_data,
1625 prog_data->binding_table.shader_time_start);
1626 }
1627
1628 void
1629 fs_generator::enable_debug(const char *shader_name)
1630 {
1631 debug_flag = true;
1632 this->shader_name = shader_name;
1633 }
1634
1635 int
1636 fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
1637 {
1638 /* align to 64 byte boundary. */
1639 while (p->next_insn_offset % 64)
1640 brw_NOP(p);
1641
1642 this->dispatch_width = dispatch_width;
1643 if (dispatch_width == 16)
1644 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1645
1646 int start_offset = p->next_insn_offset;
1647 int spill_count = 0, fill_count = 0;
1648 int loop_count = 0;
1649
1650 struct annotation_info annotation;
1651 memset(&annotation, 0, sizeof(annotation));
1652
1653 foreach_block_and_inst (block, fs_inst, inst, cfg) {
1654 struct brw_reg src[3], dst;
1655 unsigned int last_insn_offset = p->next_insn_offset;
1656 bool multiple_instructions_emitted = false;
1657
1658 if (unlikely(debug_flag))
1659 annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);
1660
1661 for (unsigned int i = 0; i < inst->sources; i++) {
1662 src[i] = brw_reg_from_fs_reg(inst, &inst->src[i], devinfo->gen);
1663
1664 /* The accumulator result appears to get used for the
1665 * conditional modifier generation. When negating a UD
1666 * value, there is a 33rd bit generated for the sign in the
1667 * accumulator value, so now you can't check, for example,
1668 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
1669 */
1670 assert(!inst->conditional_mod ||
1671 inst->src[i].type != BRW_REGISTER_TYPE_UD ||
1672 !inst->src[i].negate);
1673 }
1674 dst = brw_reg_from_fs_reg(inst, &inst->dst, devinfo->gen);
1675
1676 brw_set_default_predicate_control(p, inst->predicate);
1677 brw_set_default_predicate_inverse(p, inst->predicate_inverse);
1678 brw_set_default_flag_reg(p, 0, inst->flag_subreg);
1679 brw_set_default_saturate(p, inst->saturate);
1680 brw_set_default_mask_control(p, inst->force_writemask_all);
1681 brw_set_default_acc_write_control(p, inst->writes_accumulator);
1682 brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);
1683
1684 assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
1685 assert(inst->mlen <= BRW_MAX_MSG_LENGTH);
1686
1687 switch (inst->exec_size) {
1688 case 1:
1689 case 2:
1690 case 4:
1691 assert(inst->force_writemask_all);
1692 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1693 break;
1694 case 8:
1695 if (inst->force_sechalf) {
1696 brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1697 } else {
1698 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1699 }
1700 break;
1701 case 16:
1702 case 32:
1703 /* If the instruction writes to more than one register, it needs to
1704 * be a "compressed" instruction on Gen <= 5.
1705 */
1706 if (inst->dst.component_size(inst->exec_size) > REG_SIZE)
1707 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1708 else
1709 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1710 break;
1711 default:
1712 unreachable("Invalid instruction width");
1713 }
1714
1715 switch (inst->opcode) {
1716 case BRW_OPCODE_MOV:
1717 brw_MOV(p, dst, src[0]);
1718 break;
1719 case BRW_OPCODE_ADD:
1720 brw_ADD(p, dst, src[0], src[1]);
1721 break;
1722 case BRW_OPCODE_MUL:
1723 brw_MUL(p, dst, src[0], src[1]);
1724 break;
1725 case BRW_OPCODE_AVG:
1726 brw_AVG(p, dst, src[0], src[1]);
1727 break;
1728 case BRW_OPCODE_MACH:
1729 brw_MACH(p, dst, src[0], src[1]);
1730 break;
1731
1732 case BRW_OPCODE_LINE:
1733 brw_LINE(p, dst, src[0], src[1]);
1734 break;
1735
1736 case BRW_OPCODE_MAD:
1737 assert(devinfo->gen >= 6);
1738 brw_set_default_access_mode(p, BRW_ALIGN_16);
1739 if (dispatch_width == 16 && !devinfo->supports_simd16_3src) {
1740 brw_set_default_exec_size(p, BRW_EXECUTE_8);
1741 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1742 brw_inst *f = brw_MAD(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2]));
1743 brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1744 brw_inst *s = brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1745 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1746
1747 if (inst->conditional_mod) {
1748 brw_inst_set_cond_modifier(p->devinfo, f, inst->conditional_mod);
1749 brw_inst_set_cond_modifier(p->devinfo, s, inst->conditional_mod);
1750 multiple_instructions_emitted = true;
1751 }
1752 } else {
1753 brw_MAD(p, dst, src[0], src[1], src[2]);
1754 }
1755 brw_set_default_access_mode(p, BRW_ALIGN_1);
1756 break;
1757
1758 case BRW_OPCODE_LRP:
1759 assert(devinfo->gen >= 6);
1760 brw_set_default_access_mode(p, BRW_ALIGN_16);
1761 if (dispatch_width == 16 && !devinfo->supports_simd16_3src) {
1762 brw_set_default_exec_size(p, BRW_EXECUTE_8);
1763 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1764 brw_inst *f = brw_LRP(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2]));
1765 brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1766 brw_inst *s = brw_LRP(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1767 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1768
1769 if (inst->conditional_mod) {
1770 brw_inst_set_cond_modifier(p->devinfo, f, inst->conditional_mod);
1771 brw_inst_set_cond_modifier(p->devinfo, s, inst->conditional_mod);
1772 multiple_instructions_emitted = true;
1773 }
1774 } else {
1775 brw_LRP(p, dst, src[0], src[1], src[2]);
1776 }
1777 brw_set_default_access_mode(p, BRW_ALIGN_1);
1778 break;
1779
1780 case BRW_OPCODE_FRC:
1781 brw_FRC(p, dst, src[0]);
1782 break;
1783 case BRW_OPCODE_RNDD:
1784 brw_RNDD(p, dst, src[0]);
1785 break;
1786 case BRW_OPCODE_RNDE:
1787 brw_RNDE(p, dst, src[0]);
1788 break;
1789 case BRW_OPCODE_RNDZ:
1790 brw_RNDZ(p, dst, src[0]);
1791 break;
1792
1793 case BRW_OPCODE_AND:
1794 brw_AND(p, dst, src[0], src[1]);
1795 break;
1796 case BRW_OPCODE_OR:
1797 brw_OR(p, dst, src[0], src[1]);
1798 break;
1799 case BRW_OPCODE_XOR:
1800 brw_XOR(p, dst, src[0], src[1]);
1801 break;
1802 case BRW_OPCODE_NOT:
1803 brw_NOT(p, dst, src[0]);
1804 break;
1805 case BRW_OPCODE_ASR:
1806 brw_ASR(p, dst, src[0], src[1]);
1807 break;
1808 case BRW_OPCODE_SHR:
1809 brw_SHR(p, dst, src[0], src[1]);
1810 break;
1811 case BRW_OPCODE_SHL:
1812 brw_SHL(p, dst, src[0], src[1]);
1813 break;
1814 case BRW_OPCODE_F32TO16:
1815 assert(devinfo->gen >= 7);
1816 brw_F32TO16(p, dst, src[0]);
1817 break;
1818 case BRW_OPCODE_F16TO32:
1819 assert(devinfo->gen >= 7);
1820 brw_F16TO32(p, dst, src[0]);
1821 break;
1822 case BRW_OPCODE_CMP:
1823 /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says
1824 * that when the destination is a GRF that the dependency-clear bit on
1825 * the flag register is cleared early.
1826 *
1827 * Suggested workarounds are to disable coissuing CMP instructions
1828 * or to split CMP(16) instructions into two CMP(8) instructions.
1829 *
1830 * We choose to split into CMP(8) instructions since disabling
1831 * coissuing would affect CMP instructions not otherwise affected by
1832 * the errata.
1833 */
1834 if (dispatch_width == 16 && devinfo->gen == 7 && !devinfo->is_haswell) {
1835 if (dst.file == BRW_GENERAL_REGISTER_FILE) {
1836 brw_set_default_exec_size(p, BRW_EXECUTE_8);
1837 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1838 brw_CMP(p, firsthalf(dst), inst->conditional_mod,
1839 firsthalf(src[0]), firsthalf(src[1]));
1840 brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1841 brw_CMP(p, sechalf(dst), inst->conditional_mod,
1842 sechalf(src[0]), sechalf(src[1]));
1843 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1844
1845 multiple_instructions_emitted = true;
1846 } else if (dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
1847 /* For unknown reasons, the aforementioned workaround is not
1848 * sufficient. Overriding the type when the destination is the
1849 * null register is necessary but not sufficient by itself.
1850 */
1851 assert(dst.nr == BRW_ARF_NULL);
1852 dst.type = BRW_REGISTER_TYPE_D;
1853 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
1854 } else {
1855 unreachable("not reached");
1856 }
1857 } else {
1858 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
1859 }
1860 break;
1861 case BRW_OPCODE_SEL:
1862 brw_SEL(p, dst, src[0], src[1]);
1863 break;
1864 case BRW_OPCODE_BFREV:
1865 assert(devinfo->gen >= 7);
1866 /* BFREV only supports UD type for src and dst. */
1867 brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
1868 retype(src[0], BRW_REGISTER_TYPE_UD));
1869 break;
1870 case BRW_OPCODE_FBH:
1871 assert(devinfo->gen >= 7);
1872 /* FBH only supports UD type for dst. */
1873 brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1874 break;
1875 case BRW_OPCODE_FBL:
1876 assert(devinfo->gen >= 7);
1877 /* FBL only supports UD type for dst. */
1878 brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1879 break;
1880 case BRW_OPCODE_CBIT:
1881 assert(devinfo->gen >= 7);
1882 /* CBIT only supports UD type for dst. */
1883 brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1884 break;
1885 case BRW_OPCODE_ADDC:
1886 assert(devinfo->gen >= 7);
1887 brw_ADDC(p, dst, src[0], src[1]);
1888 break;
1889 case BRW_OPCODE_SUBB:
1890 assert(devinfo->gen >= 7);
1891 brw_SUBB(p, dst, src[0], src[1]);
1892 break;
1893 case BRW_OPCODE_MAC:
1894 brw_MAC(p, dst, src[0], src[1]);
1895 break;
1896
1897 case BRW_OPCODE_BFE:
1898 assert(devinfo->gen >= 7);
1899 brw_set_default_access_mode(p, BRW_ALIGN_16);
1900 if (dispatch_width == 16 && !devinfo->supports_simd16_3src) {
1901 brw_set_default_exec_size(p, BRW_EXECUTE_8);
1902 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1903 brw_BFE(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2]));
1904 brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1905 brw_BFE(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1906 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1907 } else {
1908 brw_BFE(p, dst, src[0], src[1], src[2]);
1909 }
1910 brw_set_default_access_mode(p, BRW_ALIGN_1);
1911 break;
1912
1913 case BRW_OPCODE_BFI1:
1914 assert(devinfo->gen >= 7);
1915 /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
1916 * should
1917 *
1918 * "Force BFI instructions to be executed always in SIMD8."
1919 */
1920 if (dispatch_width == 16 && devinfo->is_haswell) {
1921 brw_set_default_exec_size(p, BRW_EXECUTE_8);
1922 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1923 brw_BFI1(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]));
1924 brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1925 brw_BFI1(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]));
1926 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1927 } else {
1928 brw_BFI1(p, dst, src[0], src[1]);
1929 }
1930 break;
1931 case BRW_OPCODE_BFI2:
1932 assert(devinfo->gen >= 7);
1933 brw_set_default_access_mode(p, BRW_ALIGN_16);
1934 /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
1935 * should
1936 *
1937 * "Force BFI instructions to be executed always in SIMD8."
1938 *
1939 * Otherwise we would be able to emit compressed instructions like we
1940 * do for the other three-source instructions.
1941 */
1942 if (dispatch_width == 16 &&
1943 (devinfo->is_haswell || !devinfo->supports_simd16_3src)) {
1944 brw_set_default_exec_size(p, BRW_EXECUTE_8);
1945 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1946 brw_BFI2(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2]));
1947 brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1948 brw_BFI2(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1949 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1950 } else {
1951 brw_BFI2(p, dst, src[0], src[1], src[2]);
1952 }
1953 brw_set_default_access_mode(p, BRW_ALIGN_1);
1954 break;
1955
1956 case BRW_OPCODE_IF:
1957 if (inst->src[0].file != BAD_FILE) {
1958 /* The instruction has an embedded compare (only allowed on gen6) */
1959 assert(devinfo->gen == 6);
1960 gen6_IF(p, inst->conditional_mod, src[0], src[1]);
1961 } else {
1962 brw_IF(p, dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
1963 }
1964 break;
1965
1966 case BRW_OPCODE_ELSE:
1967 brw_ELSE(p);
1968 break;
1969 case BRW_OPCODE_ENDIF:
1970 brw_ENDIF(p);
1971 break;
1972
1973 case BRW_OPCODE_DO:
1974 brw_DO(p, dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
1975 break;
1976
1977 case BRW_OPCODE_BREAK:
1978 brw_BREAK(p);
1979 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
1980 break;
1981 case BRW_OPCODE_CONTINUE:
1982 brw_CONT(p);
1983 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
1984 break;
1985
1986 case BRW_OPCODE_WHILE:
1987 brw_WHILE(p);
1988 loop_count++;
1989 break;
1990
1991 case SHADER_OPCODE_RCP:
1992 case SHADER_OPCODE_RSQ:
1993 case SHADER_OPCODE_SQRT:
1994 case SHADER_OPCODE_EXP2:
1995 case SHADER_OPCODE_LOG2:
1996 case SHADER_OPCODE_SIN:
1997 case SHADER_OPCODE_COS:
1998 assert(devinfo->gen < 6 || inst->mlen == 0);
1999 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
2000 if (devinfo->gen >= 7) {
2001 gen6_math(p, dst, brw_math_function(inst->opcode), src[0],
2002 brw_null_reg());
2003 } else if (devinfo->gen == 6) {
2004 generate_math_gen6(inst, dst, src[0], brw_null_reg());
2005 } else if (devinfo->gen == 5 || devinfo->is_g4x) {
2006 generate_math_g45(inst, dst, src[0]);
2007 } else {
2008 generate_math_gen4(inst, dst, src[0]);
2009 }
2010 break;
2011 case SHADER_OPCODE_INT_QUOTIENT:
2012 case SHADER_OPCODE_INT_REMAINDER:
2013 case SHADER_OPCODE_POW:
2014 assert(devinfo->gen < 6 || inst->mlen == 0);
2015 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
2016 if (devinfo->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) {
2017 gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
2018 } else if (devinfo->gen >= 6) {
2019 generate_math_gen6(inst, dst, src[0], src[1]);
2020 } else {
2021 generate_math_gen4(inst, dst, src[0]);
2022 }
2023 break;
2024 case FS_OPCODE_CINTERP:
2025 brw_MOV(p, dst, src[0]);
2026 break;
2027 case FS_OPCODE_LINTERP:
2028 generate_linterp(inst, dst, src);
2029 break;
2030 case FS_OPCODE_PIXEL_X:
2031 assert(src[0].type == BRW_REGISTER_TYPE_UW);
2032 src[0].subnr = 0 * type_sz(src[0].type);
2033 brw_MOV(p, dst, stride(src[0], 8, 4, 1));
2034 break;
2035 case FS_OPCODE_PIXEL_Y:
2036 assert(src[0].type == BRW_REGISTER_TYPE_UW);
2037 src[0].subnr = 4 * type_sz(src[0].type);
2038 brw_MOV(p, dst, stride(src[0], 8, 4, 1));
2039 break;
2040 case FS_OPCODE_GET_BUFFER_SIZE:
2041 generate_get_buffer_size(inst, dst, src[0], src[1]);
2042 break;
2043 case SHADER_OPCODE_TEX:
2044 case FS_OPCODE_TXB:
2045 case SHADER_OPCODE_TXD:
2046 case SHADER_OPCODE_TXF:
2047 case SHADER_OPCODE_TXF_CMS:
2048 case SHADER_OPCODE_TXF_UMS:
2049 case SHADER_OPCODE_TXF_MCS:
2050 case SHADER_OPCODE_TXL:
2051 case SHADER_OPCODE_TXS:
2052 case SHADER_OPCODE_LOD:
2053 case SHADER_OPCODE_TG4:
2054 case SHADER_OPCODE_TG4_OFFSET:
2055 case SHADER_OPCODE_SAMPLEINFO:
2056 generate_tex(inst, dst, src[0], src[1]);
2057 break;
2058 case FS_OPCODE_DDX_COARSE:
2059 case FS_OPCODE_DDX_FINE:
2060 generate_ddx(inst->opcode, dst, src[0]);
2061 break;
2062 case FS_OPCODE_DDY_COARSE:
2063 case FS_OPCODE_DDY_FINE:
2064 assert(src[1].file == BRW_IMMEDIATE_VALUE);
2065 generate_ddy(inst->opcode, dst, src[0], src[1].dw1.ud);
2066 break;
2067
2068 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
2069 generate_scratch_write(inst, src[0]);
2070 spill_count++;
2071 break;
2072
2073 case SHADER_OPCODE_GEN4_SCRATCH_READ:
2074 generate_scratch_read(inst, dst);
2075 fill_count++;
2076 break;
2077
2078 case SHADER_OPCODE_GEN7_SCRATCH_READ:
2079 generate_scratch_read_gen7(inst, dst);
2080 fill_count++;
2081 break;
2082
2083 case SHADER_OPCODE_URB_READ_SIMD8:
2084 generate_urb_read(inst, dst, src[0]);
2085 break;
2086
2087 case SHADER_OPCODE_URB_WRITE_SIMD8:
2088 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
2089 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
2090 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
2091 generate_urb_write(inst, src[0]);
2092 break;
2093
2094 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
2095 generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
2096 break;
2097
2098 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
2099 generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
2100 break;
2101
2102 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
2103 generate_varying_pull_constant_load(inst, dst, src[0], src[1]);
2104 break;
2105
2106 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
2107 generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
2108 break;
2109
2110 case FS_OPCODE_REP_FB_WRITE:
2111 case FS_OPCODE_FB_WRITE:
2112 generate_fb_write(inst, src[0]);
2113 break;
2114
2115 case FS_OPCODE_BLORP_FB_WRITE:
2116 generate_blorp_fb_write(inst);
2117 break;
2118
2119 case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
2120 generate_mov_dispatch_to_flags(inst);
2121 break;
2122
2123 case FS_OPCODE_DISCARD_JUMP:
2124 generate_discard_jump(inst);
2125 break;
2126
2127 case SHADER_OPCODE_SHADER_TIME_ADD:
2128 generate_shader_time_add(inst, src[0], src[1], src[2]);
2129 break;
2130
2131 case SHADER_OPCODE_UNTYPED_ATOMIC:
2132 assert(src[2].file == BRW_IMMEDIATE_VALUE);
2133 brw_untyped_atomic(p, dst, src[0], src[1], src[2].dw1.ud,
2134 inst->mlen, !inst->dst.is_null());
2135 break;
2136
2137 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
2138 assert(src[2].file == BRW_IMMEDIATE_VALUE);
2139 brw_untyped_surface_read(p, dst, src[0], src[1],
2140 inst->mlen, src[2].dw1.ud);
2141 break;
2142
2143 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
2144 assert(src[2].file == BRW_IMMEDIATE_VALUE);
2145 brw_untyped_surface_write(p, src[0], src[1],
2146 inst->mlen, src[2].dw1.ud);
2147 break;
2148
2149 case SHADER_OPCODE_TYPED_ATOMIC:
2150 assert(src[2].file == BRW_IMMEDIATE_VALUE);
2151 brw_typed_atomic(p, dst, src[0], src[1],
2152 src[2].dw1.ud, inst->mlen, !inst->dst.is_null());
2153 break;
2154
2155 case SHADER_OPCODE_TYPED_SURFACE_READ:
2156 assert(src[2].file == BRW_IMMEDIATE_VALUE);
2157 brw_typed_surface_read(p, dst, src[0], src[1],
2158 inst->mlen, src[2].dw1.ud);
2159 break;
2160
2161 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
2162 assert(src[2].file == BRW_IMMEDIATE_VALUE);
2163 brw_typed_surface_write(p, src[0], src[1], inst->mlen, src[2].dw1.ud);
2164 break;
2165
2166 case SHADER_OPCODE_MEMORY_FENCE:
2167 brw_memory_fence(p, dst);
2168 break;
2169
2170 case FS_OPCODE_SET_SIMD4X2_OFFSET:
2171 generate_set_simd4x2_offset(inst, dst, src[0]);
2172 break;
2173
2174 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2175 brw_find_live_channel(p, dst);
2176 break;
2177
2178 case SHADER_OPCODE_BROADCAST:
2179 brw_broadcast(p, dst, src[0], src[1]);
2180 break;
2181
2182 case FS_OPCODE_SET_SAMPLE_ID:
2183 generate_set_sample_id(inst, dst, src[0], src[1]);
2184 break;
2185
2186 case FS_OPCODE_PACK_HALF_2x16_SPLIT:
2187 generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
2188 break;
2189
2190 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
2191 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
2192 generate_unpack_half_2x16_split(inst, dst, src[0]);
2193 break;
2194
2195 case FS_OPCODE_PLACEHOLDER_HALT:
2196 /* This is the place where the final HALT needs to be inserted if
2197 * we've emitted any discards. If not, this will emit no code.
2198 */
2199 if (!patch_discard_jumps_to_fb_writes()) {
2200 if (unlikely(debug_flag)) {
2201 annotation.ann_count--;
2202 }
2203 }
2204 break;
2205
2206 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
2207 generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2208 GEN7_PIXEL_INTERPOLATOR_LOC_CENTROID);
2209 break;
2210
2211 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
2212 generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2213 GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
2214 break;
2215
2216 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
2217 generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2218 GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
2219 break;
2220
2221 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
2222 generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2223 GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
2224 break;
2225
2226 case CS_OPCODE_CS_TERMINATE:
2227 generate_cs_terminate(inst, src[0]);
2228 break;
2229
2230 case SHADER_OPCODE_BARRIER:
2231 generate_barrier(inst, src[0]);
2232 break;
2233
2234 case FS_OPCODE_PACK_STENCIL_REF:
2235 generate_stencil_ref_packing(inst, dst, src[0]);
2236 break;
2237
2238 default:
2239 unreachable("Unsupported opcode");
2240
2241 case SHADER_OPCODE_LOAD_PAYLOAD:
2242 unreachable("Should be lowered by lower_load_payload()");
2243 }
2244
2245 if (multiple_instructions_emitted)
2246 continue;
2247
2248 if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
2249 assert(p->next_insn_offset == last_insn_offset + 16 ||
2250 !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
2251 "emitting more than 1 instruction");
2252
2253 brw_inst *last = &p->store[last_insn_offset / 16];
2254
2255 if (inst->conditional_mod)
2256 brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
2257 brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
2258 brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
2259 }
2260 }
2261
2262 brw_set_uip_jip(p);
2263 annotation_finalize(&annotation, p->next_insn_offset);
2264
2265 int before_size = p->next_insn_offset - start_offset;
2266 brw_compact_instructions(p, start_offset, annotation.ann_count,
2267 annotation.ann);
2268 int after_size = p->next_insn_offset - start_offset;
2269
2270 if (unlikely(debug_flag)) {
2271 fprintf(stderr, "Native code for %s\n"
2272 "SIMD%d shader: %d instructions. %d loops. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d"
2273 " bytes (%.0f%%)\n",
2274 shader_name, dispatch_width, before_size / 16, loop_count,
2275 spill_count, fill_count, promoted_constants, before_size, after_size,
2276 100.0f * (before_size - after_size) / before_size);
2277
2278 dump_assembly(p->store, annotation.ann_count, annotation.ann,
2279 p->devinfo);
2280 ralloc_free(annotation.ann);
2281 }
2282
2283 compiler->shader_debug_log(log_data,
2284 "%s SIMD%d shader: %d inst, %d loops, "
2285 "%d:%d spills:fills, Promoted %u constants, "
2286 "compacted %d to %d bytes.\n",
2287 stage_abbrev, dispatch_width, before_size / 16,
2288 loop_count, spill_count, fill_count,
2289 promoted_constants, before_size, after_size);
2290
2291 return start_offset;
2292 }
2293
2294 const unsigned *
2295 fs_generator::get_assembly(unsigned int *assembly_size)
2296 {
2297 return brw_get_program(p, assembly_size);
2298 }