src/mesa/drivers/dri/i965/brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
9 enum _subroutine {
10 SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
11 };
12
13 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
14 const struct prog_instruction *inst,
15 GLuint component);
16
17 /**
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
21 */
22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
23 {
24 int i;
25
26 for (i = 0; i < fp->Base.NumInstructions; i++) {
27 const struct prog_instruction *inst = &fp->Base.Instructions[i];
28 switch (inst->Opcode) {
29 case OPCODE_ARL:
30 case OPCODE_IF:
31 case OPCODE_ENDIF:
32 case OPCODE_CAL:
33 case OPCODE_BRK:
34 case OPCODE_RET:
35 case OPCODE_NOISE1:
36 case OPCODE_NOISE2:
37 case OPCODE_NOISE3:
38 case OPCODE_NOISE4:
39 case OPCODE_BGNLOOP:
40 return GL_TRUE;
41 default:
42 break;
43 }
44 }
45 return GL_FALSE;
46 }
47
48
49
50 static void
51 reclaim_temps(struct brw_wm_compile *c);
52
53
54 /** Mark GRF register as used. */
55 static void
56 prealloc_grf(struct brw_wm_compile *c, int r)
57 {
58 c->used_grf[r] = GL_TRUE;
59 }
60
61
62 /** Mark given GRF register as not in use. */
63 static void
64 release_grf(struct brw_wm_compile *c, int r)
65 {
66 /*assert(c->used_grf[r]);*/
67 c->used_grf[r] = GL_FALSE;
68 c->first_free_grf = MIN2(c->first_free_grf, r);
69 }
70
71
72 /** Return index of a free GRF, mark it as used. */
73 static int
74 alloc_grf(struct brw_wm_compile *c)
75 {
76 GLuint r;
77 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
78 if (!c->used_grf[r]) {
79 c->used_grf[r] = GL_TRUE;
80 c->first_free_grf = r + 1; /* a guess */
81 return r;
82 }
83 }
84
85 /* no free temps, try to reclaim some */
86 reclaim_temps(c);
87 c->first_free_grf = 0;
88
89 /* try alloc again */
90 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
91 if (!c->used_grf[r]) {
92 c->used_grf[r] = GL_TRUE;
93 c->first_free_grf = r + 1; /* a guess */
94 return r;
95 }
96 }
97
98 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
99 assert(c->used_grf[r]);
100 }
101
102 /* really, no free GRF regs found */
103 if (!c->out_of_regs) {
104 /* print warning once per compilation */
105 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
106 c->out_of_regs = GL_TRUE;
107 }
108
109 return -1;
110 }
111
112
113 /** Return number of GRF registers used */
114 static int
115 num_grf_used(const struct brw_wm_compile *c)
116 {
117 int r;
118 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
119 if (c->used_grf[r])
120 return r + 1;
121 return 0;
122 }
123
124
125
126 /**
127 * Record the mapping of a Mesa register to a hardware register.
128 */
129 static void set_reg(struct brw_wm_compile *c, int file, int index,
130 int component, struct brw_reg reg)
131 {
132 c->wm_regs[file][index][component].reg = reg;
133 c->wm_regs[file][index][component].inited = GL_TRUE;
134 }
135
136 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
137 {
138 struct brw_reg reg;
139
140 /* if we need to allocate another temp, grow the tmp_regs[] array */
141 if (c->tmp_index == c->tmp_max) {
142 int r = alloc_grf(c);
143 if (r < 0) {
144 /*printf("Out of temps in %s\n", __FUNCTION__);*/
145 r = 50; /* XXX random register! */
146 }
147 c->tmp_regs[ c->tmp_max++ ] = r;
148 }
149
150 /* form the GRF register */
151 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
152 /*printf("alloc_temp %d\n", reg.nr);*/
153 assert(reg.nr < BRW_WM_MAX_GRF);
154 return reg;
155
156 }
157
158 /**
159 * Save current temp register info.
160 * There must be a matching call to release_tmps().
161 */
162 static int mark_tmps(struct brw_wm_compile *c)
163 {
164 return c->tmp_index;
165 }
166
167 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
168 {
169 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
170 }
171
172 static void release_tmps(struct brw_wm_compile *c, int mark)
173 {
174 c->tmp_index = mark;
175 }
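/*
 * Typical usage of the temporary-register helpers above, as seen throughout
 * this file:
 *
 *    int mark = mark_tmps(c);
 *    struct brw_reg tmp = alloc_tmp(c);
 *    ... emit code using tmp ...
 *    release_tmps(c, mark);
 *
 * release_tmps() simply rewinds the allocation index, so every mark_tmps()
 * must be paired with a matching release_tmps().
 */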
176
177 /**
178 * Convert Mesa src register to brw register.
179 *
180 * Since we're running in SOA mode each Mesa register corresponds to four
181 * hardware registers. We allocate the hardware registers as needed here.
182 *
183 * \param file register file, one of PROGRAM_x
184 * \param index register number
185 * \param component src component (X=0, Y=1, Z=2, W=3)
186 * \param nr not used by this function
187 * \param neg negate value?
188 * \param abs take absolute value?
189 */
190 static struct brw_reg
191 get_reg(struct brw_wm_compile *c, int file, int index, int component,
192 int nr, GLuint neg, GLuint abs)
193 {
194 struct brw_reg reg;
195 switch (file) {
196 case PROGRAM_STATE_VAR:
197 case PROGRAM_CONSTANT:
198 case PROGRAM_UNIFORM:
199 file = PROGRAM_STATE_VAR;
200 break;
201 case PROGRAM_UNDEFINED:
202 return brw_null_reg();
203 case PROGRAM_TEMPORARY:
204 case PROGRAM_INPUT:
205 case PROGRAM_OUTPUT:
206 case PROGRAM_PAYLOAD:
207 break;
208 default:
209 _mesa_problem(NULL, "Unexpected file in get_reg()");
210 return brw_null_reg();
211 }
212
213 assert(index < 256);
214 assert(component < 4);
215
216 /* see if we've already allocated a HW register for this Mesa register */
217 if (c->wm_regs[file][index][component].inited) {
218 /* yes, re-use */
219 reg = c->wm_regs[file][index][component].reg;
220 }
221 else {
222 /* no, allocate new register */
223 int grf = alloc_grf(c);
224 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
225 if (grf < 0) {
226 /* totally out of temps */
227 grf = 51; /* XXX random register! */
228 }
229
230 reg = brw_vec8_grf(grf, 0);
231 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
232
233 set_reg(c, file, index, component, reg);
234 }
235
236 if (neg & (1 << component)) {
237 reg = negate(reg);
238 }
239 if (abs)
240 reg = brw_abs(reg);
241 return reg;
242 }
243
244
245
246 /**
247 * This is called if we run out of GRF registers. Examine the live intervals
248 * of temp regs in the program and free those which won't be used again.
249 */
250 static void
251 reclaim_temps(struct brw_wm_compile *c)
252 {
253 GLint intBegin[MAX_PROGRAM_TEMPS];
254 GLint intEnd[MAX_PROGRAM_TEMPS];
255 int index;
256
257 /*printf("Reclaim temps:\n");*/
258
259 _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
260 intBegin, intEnd);
261
262 for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
263 if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
264 /* program temp[i] can be freed */
265 int component;
266 /*printf(" temp[%d] is dead\n", index);*/
267 for (component = 0; component < 4; component++) {
268 if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
269 int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
270 release_grf(c, r);
271 /*
272 printf(" Reclaim temp %d, reg %d at inst %d\n",
273 index, r, c->cur_inst);
274 */
275 c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
276 }
277 }
278 }
279 }
280 }
281
282
283
284
285 /**
286 * Preallocate registers. This sets up the Mesa to hardware register
287 * mapping for certain registers, such as constants (uniforms/state vars)
288 * and shader inputs.
289 */
290 static void prealloc_reg(struct brw_wm_compile *c)
291 {
292 int i, j;
293 struct brw_reg reg;
294 int urb_read_length = 0;
295 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
296 GLuint reg_index = 0;
297
298 memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
299 c->first_free_grf = 0;
300
301 for (i = 0; i < 4; i++) {
302 if (i < c->key.nr_depth_regs)
303 reg = brw_vec8_grf(i * 2, 0);
304 else
305 reg = brw_vec8_grf(0, 0);
306 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
307 }
308 reg_index += 2 * c->key.nr_depth_regs;
309
310 /* constants */
311 {
312 const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
313 const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
314
315 /* use a real constant buffer, or just use a section of the GRF? */
316 /* XXX this heuristic may need adjustment... */
317 if ((nr_params + nr_temps) * 4 + reg_index > 80)
318 c->fp->use_const_buffer = GL_TRUE;
319 else
320 c->fp->use_const_buffer = GL_FALSE;
321 /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
322
323 if (c->fp->use_const_buffer) {
324 /* We'll use a real constant buffer and fetch constants from
325 * it with a dataport read message.
326 */
327
328 /* number of float constants in CURBE */
329 c->prog_data.nr_params = 0;
330 }
331 else {
332 const struct gl_program_parameter_list *plist =
333 c->fp->program.Base.Parameters;
334 int index = 0;
335
336 /* number of float constants in CURBE */
337 c->prog_data.nr_params = 4 * nr_params;
338
339 /* loop over program constants (float[4]) */
340 for (i = 0; i < nr_params; i++) {
341 /* loop over XYZW channels */
342 for (j = 0; j < 4; j++, index++) {
343 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
344 /* Save pointer to parameter/constant value.
345 * Constants will be copied in prepare_constant_buffer()
346 */
347 c->prog_data.param[index] = &plist->ParameterValues[i][j];
348 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
349 }
350 }
351 /* number of constant regs used (each reg is float[8]) */
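            /* (4 * nr_params is rounded up to a multiple of 16 floats here,
             * apparently because the CURBE is handled in two-register,
             * 16-float chunks.)
             */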
352 c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
353 reg_index += c->nr_creg;
354 }
355 }
356
357 /* fragment shader inputs */
358 for (i = 0; i < VERT_RESULT_MAX; i++) {
359 int fp_input;
360
361 if (i >= VERT_RESULT_VAR0)
362 fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
363 else if (i <= VERT_RESULT_TEX7)
364 fp_input = i;
365 else
366 fp_input = -1;
367
368 if (fp_input >= 0 && inputs & (1 << fp_input)) {
369 urb_read_length = reg_index;
370 reg = brw_vec8_grf(reg_index, 0);
371 for (j = 0; j < 4; j++)
372 set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
373 }
374 if (c->key.vp_outputs_written & (1 << i)) {
375 reg_index += 2;
376 }
377 }
378
379 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
380 c->prog_data.urb_read_length = urb_read_length;
381 c->prog_data.curb_read_length = c->nr_creg;
382 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
383 reg_index++;
384 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
385 reg_index += 2;
386
387 /* mark GRF regs [0..reg_index-1] as in-use */
388 for (i = 0; i < reg_index; i++)
389 prealloc_grf(c, i);
390
391 /* Don't use GRF 126, 127. Using them seems to lead to GPU lock-ups */
392 prealloc_grf(c, 126);
393 prealloc_grf(c, 127);
394
395 for (i = 0; i < c->nr_fp_insns; i++) {
396 const struct prog_instruction *inst = &c->prog_instructions[i];
397 struct brw_reg dst[4];
398
399 switch (inst->Opcode) {
400 case OPCODE_TEX:
401 case OPCODE_TXB:
402 /* Allocate the channels of texture results contiguously,
403 * since they are written out that way by the sampler unit.
404 */
405 for (j = 0; j < 4; j++) {
406 dst[j] = get_dst_reg(c, inst, j);
407 if (j != 0)
408 assert(dst[j].nr == dst[j - 1].nr + 1);
409 }
410 break;
411 default:
412 break;
413 }
414 }
415
416 /* An instruction may reference up to three constants.
417 * They'll be found in these registers.
418 * XXX alloc these on demand!
419 */
420 if (c->fp->use_const_buffer) {
421 for (i = 0; i < 3; i++) {
422 c->current_const[i].index = -1;
423 c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
424 }
425 }
426 #if 0
427 printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
428 printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
429 #endif
430 }
431
432
433 /**
434 * Check if any of the instruction's src registers are constants, uniforms,
435 * or statevars. If so, fetch any constants that we don't already have in
436 * the three GRF slots.
437 */
438 static void fetch_constants(struct brw_wm_compile *c,
439 const struct prog_instruction *inst)
440 {
441 struct brw_compile *p = &c->func;
442 GLuint i;
443
444 /* loop over instruction src regs */
445 for (i = 0; i < 3; i++) {
446 const struct prog_src_register *src = &inst->SrcReg[i];
447 if (src->File == PROGRAM_STATE_VAR ||
448 src->File == PROGRAM_CONSTANT ||
449 src->File == PROGRAM_UNIFORM) {
450 c->current_const[i].index = src->Index;
451
452 #if 0
453 printf(" fetch const[%d] for arg %d into reg %d\n",
454 src->Index, i, c->current_const[i].reg.nr);
455 #endif
456
457 /* need to fetch the constant now */
458 brw_dp_READ_4(p,
459 c->current_const[i].reg, /* writeback dest */
460 src->RelAddr, /* relative indexing? */
461 16 * src->Index, /* byte offset */
462 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
463 );
464 }
465 }
466 }
467
468
469 /**
470 * Convert Mesa dst register to brw register.
471 */
472 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
473 const struct prog_instruction *inst,
474 GLuint component)
475 {
476 const int nr = 1;
477 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
478 0, 0);
479 }
480
481
482 static struct brw_reg
483 get_src_reg_const(struct brw_wm_compile *c,
484 const struct prog_instruction *inst,
485 GLuint srcRegIndex, GLuint component)
486 {
487 /* We should have already fetched the constant from the constant
488 * buffer in fetch_constants(). Now we just have to return a
489 * register description that extracts the needed component and
490 * smears it across all eight vector components.
491 */
492 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
493 struct brw_reg const_reg;
494
495 assert(component < 4);
496 assert(srcRegIndex < 3);
497 assert(c->current_const[srcRegIndex].index != -1);
498 const_reg = c->current_const[srcRegIndex].reg;
499
500 /* extract desired float from the const_reg, and smear */
501 const_reg = stride(const_reg, 0, 1, 0);
502 const_reg.subnr = component * 4;
503
504 if (src->Negate & (1 << component))
505 const_reg = negate(const_reg);
506 if (src->Abs)
507 const_reg = brw_abs(const_reg);
508
509 #if 0
510 printf(" form const[%d].%d for arg %d, reg %d\n",
511 c->current_const[srcRegIndex].index,
512 component,
513 srcRegIndex,
514 const_reg.nr);
515 #endif
516
517 return const_reg;
518 }
519
520
521 /**
522 * Convert Mesa src register to brw register.
523 */
524 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
525 const struct prog_instruction *inst,
526 GLuint srcRegIndex, GLuint channel)
527 {
528 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
529 const GLuint nr = 1;
530 const GLuint component = GET_SWZ(src->Swizzle, channel);
531
532 /* Extended swizzle terms */
533 if (component == SWIZZLE_ZERO) {
534 return brw_imm_f(0.0F);
535 }
536 else if (component == SWIZZLE_ONE) {
537 return brw_imm_f(1.0F);
538 }
539
540 if (c->fp->use_const_buffer &&
541 (src->File == PROGRAM_STATE_VAR ||
542 src->File == PROGRAM_CONSTANT ||
543 src->File == PROGRAM_UNIFORM)) {
544 return get_src_reg_const(c, inst, srcRegIndex, component);
545 }
546 else {
547 /* other type of source register */
548 return get_reg(c, src->File, src->Index, component, nr,
549 src->Negate, src->Abs);
550 }
551 }
552
553 /**
554 * Subroutines are minimal support for reusable instruction sequences.
555 * They are implemented as simply as possible to minimise overhead: there
556 * is no explicit support for communication between the caller and callee
557 * other than saving the return address in a temporary register, nor is
558 * there any automatic local storage. This implies that great care is
559 * required before attempting reentrancy or any kind of nested
560 * subroutine invocations.
561 */
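/*
 * Calling convention used by the noise routines below: the caller copies its
 * arguments into freshly allocated temporaries immediately before calling
 * invoke_subroutine(), and the subroutine body retrieves them with
 * lookup_tmp(c, mark - n), counting back past the return-address temporary
 * that invoke_subroutine() itself allocates.  The result is left in the first
 * argument temporary.
 */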
562 static void invoke_subroutine( struct brw_wm_compile *c,
563 enum _subroutine subroutine,
564 void (*emit)( struct brw_wm_compile * ) )
565 {
566 struct brw_compile *p = &c->func;
567
568 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
569
570 if( c->subroutines[ subroutine ] ) {
571 /* subroutine previously emitted: reuse existing instructions */
572
573 int mark = mark_tmps( c );
574 struct brw_reg return_address = retype( alloc_tmp( c ),
575 BRW_REGISTER_TYPE_UD );
576 int here = p->nr_insn;
577
578 brw_push_insn_state(p);
579 brw_set_mask_control(p, BRW_MASK_DISABLE);
580 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
581
582 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
583 brw_imm_d( ( c->subroutines[ subroutine ] -
584 here - 1 ) << 4 ) );
585 brw_pop_insn_state(p);
586
587 release_tmps( c, mark );
588 } else {
589 /* previously unused subroutine: emit, and mark for later reuse */
590
591 int mark = mark_tmps( c );
592 struct brw_reg return_address = retype( alloc_tmp( c ),
593 BRW_REGISTER_TYPE_UD );
594 struct brw_instruction *calc;
595 int base = p->nr_insn;
596
597 brw_push_insn_state(p);
598 brw_set_mask_control(p, BRW_MASK_DISABLE);
599 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
600 brw_pop_insn_state(p);
601
602 c->subroutines[ subroutine ] = p->nr_insn;
603
604 emit( c );
605
606 brw_push_insn_state(p);
607 brw_set_mask_control(p, BRW_MASK_DISABLE);
608 brw_MOV( p, brw_ip_reg(), return_address );
609 brw_pop_insn_state(p);
610
611 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
612
613 release_tmps( c, mark );
614 }
615 }
616
617 /* Workaround for using brw_wm_emit.c's emit functions, which expect
618 * destination regs to be uniquely written. Moves arguments out to
619 * temporaries as necessary for instructions which use their destination as
620 * a temporary.
621 */
622 static void
623 unalias3(struct brw_wm_compile *c,
624 void (*func)(struct brw_compile *c,
625 const struct brw_reg *dst,
626 GLuint mask,
627 const struct brw_reg *arg0,
628 const struct brw_reg *arg1,
629 const struct brw_reg *arg2),
630 const struct brw_reg *dst,
631 GLuint mask,
632 const struct brw_reg *arg0,
633 const struct brw_reg *arg1,
634 const struct brw_reg *arg2)
635 {
636 struct brw_compile *p = &c->func;
637 struct brw_reg tmp_arg0[4], tmp_arg1[4], tmp_arg2[4];
638 int i, j;
639 int mark = mark_tmps(c);
640
641 for (j = 0; j < 4; j++) {
642 tmp_arg0[j] = arg0[j];
643 tmp_arg1[j] = arg1[j];
644 tmp_arg2[j] = arg2[j];
645 }
646
647 for (i = 0; i < 4; i++) {
648 if (mask & (1<<i)) {
649 for (j = 0; j < 4; j++) {
650 if (arg0[j].file == dst[i].file &&
651 dst[i].nr == arg0[j].nr) {
652 tmp_arg0[j] = alloc_tmp(c);
653 brw_MOV(p, tmp_arg0[j], arg0[j]);
654 }
655 if (arg1[j].file == dst[i].file &&
656 dst[i].nr == arg1[j].nr) {
657 tmp_arg1[j] = alloc_tmp(c);
658 brw_MOV(p, tmp_arg1[j], arg1[j]);
659 }
660 if (arg2[j].file == dst[i].file &&
661 dst[i].nr == arg2[j].nr) {
662 tmp_arg2[j] = alloc_tmp(c);
663 brw_MOV(p, tmp_arg2[j], arg2[j]);
664 }
665 }
666 }
667 }
668
669 func(p, dst, mask, tmp_arg0, tmp_arg1, tmp_arg2);
670
671 release_tmps(c, mark);
672 }
673
674 static void fire_fb_write( struct brw_wm_compile *c,
675 GLuint base_reg,
676 GLuint nr,
677 GLuint target,
678 GLuint eot)
679 {
680 struct brw_compile *p = &c->func;
681 /* Pass through control information:
682 */
683 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
684 {
685 brw_push_insn_state(p);
686 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
687 brw_MOV(p,
688 brw_message_reg(base_reg + 1),
689 brw_vec8_grf(1, 0));
690 brw_pop_insn_state(p);
691 }
692 /* Send framebuffer write message: */
693 brw_fb_WRITE(p,
694 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
695 base_reg,
696 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
697 target,
698 nr,
699 0,
700 eot);
701 }
702
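/*
 * Note on the message built by emit_fb_write()/fire_fb_write() below: m1 is
 * the pass-through copy of r1, the four colour channels follow (each
 * compressed SIMD16 MOV fills two message registers, its second half landing
 * four registers later, hence "nr += 8"), and the optional source-depth and
 * destination-depth payloads are appended after that.
 */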
703 static void emit_fb_write(struct brw_wm_compile *c,
704 const struct prog_instruction *inst)
705 {
706 struct brw_compile *p = &c->func;
707 int nr = 2;
708 int channel;
709 GLuint target, eot;
710 struct brw_reg src0;
711
712 /* Reserve space for AA - may not be needed:
713 */
714 if (c->key.aa_dest_stencil_reg)
715 nr += 1;
716
717 brw_push_insn_state(p);
718 for (channel = 0; channel < 4; channel++) {
719 src0 = get_src_reg(c, inst, 0, channel);
720 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
721 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
722 brw_MOV(p, brw_message_reg(nr + channel), src0);
723 }
724 /* skip over the regs populated above: */
725 nr += 8;
726 brw_pop_insn_state(p);
727
728 if (c->key.source_depth_to_render_target) {
729 if (c->key.computes_depth) {
730 src0 = get_src_reg(c, inst, 2, 2);
731 brw_MOV(p, brw_message_reg(nr), src0);
732 }
733 else {
734 src0 = get_src_reg(c, inst, 1, 1);
735 brw_MOV(p, brw_message_reg(nr), src0);
736 }
737
738 nr += 2;
739 }
740
741 if (c->key.dest_depth_reg) {
742 const GLuint comp = c->key.dest_depth_reg / 2;
743 const GLuint off = c->key.dest_depth_reg % 2;
744
745 if (off != 0) {
746 /* XXX this code needs review/testing */
747 struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
748 struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
749
750 brw_push_insn_state(p);
751 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
752
753 brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
754 /* 2nd half? */
755 brw_MOV(p, brw_message_reg(nr+1), arg1_1);
756 brw_pop_insn_state(p);
757 }
758 else
759 {
760 struct brw_reg src = get_src_reg(c, inst, 1, 1);
761 brw_MOV(p, brw_message_reg(nr), src);
762 }
763 nr += 2;
764 }
765
766 target = INST_AUX_GET_TARGET(inst->Aux);
767 eot = inst->Aux & INST_AUX_EOT;
768 fire_fb_write(c, 0, nr, target, eot);
769 }
770
771 static void emit_arl(struct brw_wm_compile *c,
772 const struct prog_instruction *inst)
773 {
774 struct brw_compile *p = &c->func;
775 struct brw_reg src0, addr_reg;
776 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
777 addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
778 BRW_ARF_ADDRESS, 0);
779 src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
780 brw_MOV(p, addr_reg, src0);
781 brw_set_saturate(p, 0);
782 }
783
784
785 static void emit_min_max(struct brw_wm_compile *c,
786 const struct prog_instruction *inst)
787 {
788 struct brw_compile *p = &c->func;
789 const GLuint mask = inst->DstReg.WriteMask;
790 const int mark = mark_tmps(c);
791 int i;
792 brw_push_insn_state(p);
793 for (i = 0; i < 4; i++) {
794 if (mask & (1<<i)) {
795 struct brw_reg real_dst = get_dst_reg(c, inst, i);
796 struct brw_reg src0 = get_src_reg(c, inst, 0, i);
797 struct brw_reg src1 = get_src_reg(c, inst, 1, i);
798 struct brw_reg dst;
799 /* if real_dst aliases src0 or src1 we need to write to a temp reg */
800 GLboolean use_temp = brw_same_reg(real_dst, src0) ||
801                      brw_same_reg(real_dst, src1);
802 if (use_temp)
803 dst = alloc_tmp(c);
804 else
805 dst = real_dst;
806
807 /*
808 printf(" Min/max: dst %d src0 %d src1 %d\n",
809 dst.nr, src0.nr, src1.nr);
810 */
811 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
812 brw_MOV(p, dst, src0);
813 brw_set_saturate(p, 0);
814
815 if (inst->Opcode == OPCODE_MIN)
816 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
817 else
818 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
819
820 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
821 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
822 brw_MOV(p, dst, src1);
823 brw_set_saturate(p, 0);
824 brw_set_predicate_control_flag_value(p, 0xff);
825 if (use_temp)
826 brw_MOV(p, real_dst, dst);
827 }
828 }
829 brw_pop_insn_state(p);
830 release_tmps(c, mark);
831 }
832
833 /**
834 * For GLSL shaders, this KIL will be unconditional.
835 * It may be contained inside an IF/ENDIF structure of course.
836 */
837 static void emit_kil(struct brw_wm_compile *c)
838 {
839 struct brw_compile *p = &c->func;
840 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
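   /* Note: despite its name, "depth" here appears to be the per-pixel
    * dispatch mask word in the r0 payload header; ANDing ~IMASK into it
    * below is what actually discards the killed channels.
    */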
841 brw_push_insn_state(p);
842 brw_set_mask_control(p, BRW_MASK_DISABLE);
843 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
844 brw_AND(p, depth, c->emit_mask_reg, depth);
845 brw_pop_insn_state(p);
846 }
847
848 static INLINE struct brw_reg high_words( struct brw_reg reg )
849 {
850 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
851 0, 8, 2 );
852 }
853
854 static INLINE struct brw_reg low_words( struct brw_reg reg )
855 {
856 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
857 }
858
859 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
860 {
861 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
862 }
863
864 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
865 {
866 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
867 0, 16, 2 );
868 }
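/*
 * The four helpers above build strided views that select alternate 16-bit
 * words or 8-bit bytes of a register; the noise hashing below uses them to
 * do packed 16-bit arithmetic on the integer temporaries.
 */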
869
870 /* One-, two- and three-dimensional Perlin noise, similar to the description
871 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
872 static void noise1_sub( struct brw_wm_compile *c ) {
873
874 struct brw_compile *p = &c->func;
875 struct brw_reg param,
876 x0, x1, /* gradients at each end */
877 t, tmp[ 2 ], /* float temporaries */
878 itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
879 int i;
880 int mark = mark_tmps( c );
881
882 x0 = alloc_tmp( c );
883 x1 = alloc_tmp( c );
884 t = alloc_tmp( c );
885 tmp[ 0 ] = alloc_tmp( c );
886 tmp[ 1 ] = alloc_tmp( c );
887 itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
888 itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
889 itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
890 itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
891 itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
892
893 param = lookup_tmp( c, mark - 2 );
894
895 brw_set_access_mode( p, BRW_ALIGN_1 );
896
897 brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
898
899 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
900 be hashed. Also compute the remainder (offset within the unit
901 length), interleaved to reduce register dependency penalties. */
902 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
903 brw_FRC( p, param, param );
904 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
905 brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
906 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
907
908 /* We're now ready to perform the hashing. The two hashes are
909 interleaved for performance. The hash function used is
910 designed to rapidly achieve avalanche and require only 32x16
911 bit multiplication, and 16-bit swizzles (which we get for
912 free). We can't use immediate operands in the multiplies,
913 because immediates are permitted only in src1 and the 16-bit
914 factor is permitted only in src0. */
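   /* In scalar terms, each hash below is three rounds of
    *    h *= K;  h_low ^= h_high;
    * with K = 0xBA97, 0x79D9 and 0xD5B1 in turn (the constants loaded above).
    */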
915 for( i = 0; i < 2; i++ )
916 brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
917 for( i = 0; i < 2; i++ )
918 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
919 high_words( itmp[ i ] ) );
920 for( i = 0; i < 2; i++ )
921 brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
922 for( i = 0; i < 2; i++ )
923 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
924 high_words( itmp[ i ] ) );
925 for( i = 0; i < 2; i++ )
926 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
927 for( i = 0; i < 2; i++ )
928 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
929 high_words( itmp[ i ] ) );
930
931 /* Now we want to initialise the two gradients based on the
932 hashes. Format conversion from signed integer to float leaves
933 everything scaled too high by a factor of pow( 2, 31 ), but
934 we correct for that right at the end. */
935 brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
936 brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
937 brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
938
939 brw_MUL( p, x0, x0, param );
940 brw_MUL( p, x1, x1, t );
941
942 /* We interpolate between the gradients using the polynomial
943 6t^5 - 15t^4 + 10t^3 (Perlin). */
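   /* The sequence below evaluates it in Horner form:
    *    ((6t - 15)t + 10) * t * t * t  ==  6t^5 - 15t^4 + 10t^3
    * with an unrelated ADD slipped in to hide latency.
    */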
944 brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
945 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
946 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
947 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
948 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
949 brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
950 pipeline */
951 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
952 brw_MUL( p, param, tmp[ 0 ], param );
953 brw_MUL( p, x1, x1, param );
954 brw_ADD( p, x0, x0, x1 );
955 /* scale by pow( 2, -30 ), to compensate for the format conversion
956 above and an extra factor of 2 so that a single gradient covers
957 the [-1,1] range */
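   /* 0.000000000931322574615478515625 is exactly 2^-30. */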
958 brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
959
960 release_tmps( c, mark );
961 }
962
963 static void emit_noise1( struct brw_wm_compile *c,
964 const struct prog_instruction *inst )
965 {
966 struct brw_compile *p = &c->func;
967 struct brw_reg src, param, dst;
968 GLuint mask = inst->DstReg.WriteMask;
969 int i;
970 int mark = mark_tmps( c );
971
972 assert( mark == 0 );
973
974 src = get_src_reg( c, inst, 0, 0 );
975
976 param = alloc_tmp( c );
977
978 brw_MOV( p, param, src );
979
980 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
981
982 /* Fill in the result: */
983 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
984 for (i = 0 ; i < 4; i++) {
985 if (mask & (1<<i)) {
986 dst = get_dst_reg(c, inst, i);
987 brw_MOV( p, dst, param );
988 }
989 }
990 if( inst->SaturateMode == SATURATE_ZERO_ONE )
991 brw_set_saturate( p, 0 );
992
993 release_tmps( c, mark );
994 }
995
996 static void noise2_sub( struct brw_wm_compile *c ) {
997
998 struct brw_compile *p = &c->func;
999 struct brw_reg param0, param1,
1000 x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1001 t, tmp[ 4 ], /* float temporaries */
1002 itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1003 int i;
1004 int mark = mark_tmps( c );
1005
1006 x0y0 = alloc_tmp( c );
1007 x0y1 = alloc_tmp( c );
1008 x1y0 = alloc_tmp( c );
1009 x1y1 = alloc_tmp( c );
1010 t = alloc_tmp( c );
1011 for( i = 0; i < 4; i++ ) {
1012 tmp[ i ] = alloc_tmp( c );
1013 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1014 }
1015 itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1016 itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1017 itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1018
1019 param0 = lookup_tmp( c, mark - 3 );
1020 param1 = lookup_tmp( c, mark - 2 );
1021
1022 brw_set_access_mode( p, BRW_ALIGN_1 );
1023
1024 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1025 be hashed. Also compute the remainders (offsets within the unit
1026 square), interleaved to reduce register dependency penalties. */
1027 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1028 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1029 brw_FRC( p, param0, param0 );
1030 brw_FRC( p, param1, param1 );
1031 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1032 brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1033 low_words( itmp[ 1 ] ) );
1034 brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1035 brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1036 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1037 brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1038 brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
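   /* At this point itmp[ 0 ] effectively packs the x lattice coordinate in
    * its low 16-bit words and the y coordinate in its high words, so the
    * additions of 0x1, 0x10000 and 0x10001 produce the (x+1,y), (x,y+1)
    * and (x+1,y+1) corners respectively.
    */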
1039
1040 /* We're now ready to perform the hashing. The four hashes are
1041 interleaved for performance. The hash function used is
1042 designed to rapidly achieve avalanche and require only 32x16
1043 bit multiplication, and 16-bit swizzles (which we get for
1044 free). We can't use immediate operands in the multiplies,
1045 because immediates are permitted only in src1 and the 16-bit
1046 factor is permitted only in src0. */
1047 for( i = 0; i < 4; i++ )
1048 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1049 for( i = 0; i < 4; i++ )
1050 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1051 high_words( itmp[ i ] ) );
1052 for( i = 0; i < 4; i++ )
1053 brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1054 for( i = 0; i < 4; i++ )
1055 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1056 high_words( itmp[ i ] ) );
1057 for( i = 0; i < 4; i++ )
1058 brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1059 for( i = 0; i < 4; i++ )
1060 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1061 high_words( itmp[ i ] ) );
1062
1063 /* Now we want to initialise the four gradients based on the
1064 hashes. Format conversion from signed integer to float leaves
1065 everything scaled too high by a factor of pow( 2, 15 ), but
1066 we correct for that right at the end. */
1067 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1068 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1069 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1070 brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1071 brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1072
1073 brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1074 brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1075 brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1076 brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1077
1078 brw_MUL( p, x1y0, x1y0, t );
1079 brw_MUL( p, x1y1, x1y1, t );
1080 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1081 brw_MUL( p, x0y0, x0y0, param0 );
1082 brw_MUL( p, x0y1, x0y1, param0 );
1083
1084 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1085 brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1086 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1087 brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1088
1089 brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1090 brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1091 brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1092 brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1093
1094 /* We interpolate between the gradients using the polynomial
1095 6t^5 - 15t^4 + 10t^3 (Perlin). */
1096 brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1097 brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1098 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1099 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1100 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1101 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1102 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1103 pipeline */
1104 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1105 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1106 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1107 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1108 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1109 pipeline */
1110 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1111 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1112 brw_MUL( p, param0, tmp[ 0 ], param0 );
1113 brw_MUL( p, param1, tmp[ 1 ], param1 );
1114
1115 /* Here we interpolate in the y dimension... */
1116 brw_MUL( p, x0y1, x0y1, param1 );
1117 brw_MUL( p, x1y1, x1y1, param1 );
1118 brw_ADD( p, x0y0, x0y0, x0y1 );
1119 brw_ADD( p, x1y0, x1y0, x1y1 );
1120
1121 /* And now in x. There are horrible register dependencies here,
1122 but we have nothing else to do. */
1123 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1124 brw_MUL( p, x1y0, x1y0, param0 );
1125 brw_ADD( p, x0y0, x0y0, x1y0 );
1126
1127 /* scale by pow( 2, -15 ), as described above */
1128 brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1129
1130 release_tmps( c, mark );
1131 }
1132
1133 static void emit_noise2( struct brw_wm_compile *c,
1134 const struct prog_instruction *inst )
1135 {
1136 struct brw_compile *p = &c->func;
1137 struct brw_reg src0, src1, param0, param1, dst;
1138 GLuint mask = inst->DstReg.WriteMask;
1139 int i;
1140 int mark = mark_tmps( c );
1141
1142 assert( mark == 0 );
1143
1144 src0 = get_src_reg( c, inst, 0, 0 );
1145 src1 = get_src_reg( c, inst, 0, 1 );
1146
1147 param0 = alloc_tmp( c );
1148 param1 = alloc_tmp( c );
1149
1150 brw_MOV( p, param0, src0 );
1151 brw_MOV( p, param1, src1 );
1152
1153 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1154
1155 /* Fill in the result: */
1156 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1157 for (i = 0 ; i < 4; i++) {
1158 if (mask & (1<<i)) {
1159 dst = get_dst_reg(c, inst, i);
1160 brw_MOV( p, dst, param0 );
1161 }
1162 }
1163 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1164 brw_set_saturate( p, 0 );
1165
1166 release_tmps( c, mark );
1167 }
1168
1169 /**
1170 * The three-dimensional case is much like the one- and two-dimensional versions above,
1171 * but since the number of corners is rapidly growing we now pack 16 16-bit
1172 * hashes into each register to extract more parallelism from the EUs.
1173 */
1174 static void noise3_sub( struct brw_wm_compile *c ) {
1175
1176 struct brw_compile *p = &c->func;
1177 struct brw_reg param0, param1, param2,
1178 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1179 xi, yi, zi, /* interpolation coefficients */
1180 t, tmp[ 8 ], /* float temporaries */
1181 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1182 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1183 int i;
1184 int mark = mark_tmps( c );
1185
1186 x0y0 = alloc_tmp( c );
1187 x0y1 = alloc_tmp( c );
1188 x1y0 = alloc_tmp( c );
1189 x1y1 = alloc_tmp( c );
1190 xi = alloc_tmp( c );
1191 yi = alloc_tmp( c );
1192 zi = alloc_tmp( c );
1193 t = alloc_tmp( c );
1194 for( i = 0; i < 8; i++ ) {
1195 tmp[ i ] = alloc_tmp( c );
1196 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1197 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1198 }
1199
1200 param0 = lookup_tmp( c, mark - 4 );
1201 param1 = lookup_tmp( c, mark - 3 );
1202 param2 = lookup_tmp( c, mark - 2 );
1203
1204 brw_set_access_mode( p, BRW_ALIGN_1 );
1205
1206 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1207 be hashed. Also compute the remainders (offsets within the unit
1208 cube), interleaved to reduce register dependency penalties. */
1209 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1210 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1211 brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1212 brw_FRC( p, param0, param0 );
1213 brw_FRC( p, param1, param1 );
1214 brw_FRC( p, param2, param2 );
1215 /* Since we now have only 16 bits of precision in the hash, we must
1216 be more careful about thorough mixing to maintain entropy as we
1217 squash the input vector into a small scalar. */
1218 brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1219 brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1220 brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1221 brw_imm_uw( 0x9B93 ) );
1222 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1223 brw_imm_uw( 0xBC8F ) );
1224
1225 /* Temporarily disable the execution mask while we work with ExecSize=16
1226 channels (the mask is set for ExecSize=8 and is probably incorrect).
1227 Although this might cause execution of unwanted channels, the code
1228 writes only to temporary registers and has no side effects, so
1229 disabling the mask is harmless. */
1230 brw_push_insn_state( p );
1231 brw_set_mask_control( p, BRW_MASK_DISABLE );
1232 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1233 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1234 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
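   /* wtmp[ 0 ] now holds the (x,y,z) hash seed in its low words and the
    * (x+1,y,z) seed in its high words; the +0xD0BD and +0x9B93 offsets are
    * the y and z hash increments, so wtmp[ 0..3 ] together cover all eight
    * cube corners.
    */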
1235
1236 /* We're now ready to perform the hashing. The eight hashes are
1237 interleaved for performance. The hash function used is
1238 designed to rapidly achieve avalanche and require only 16x16
1239 bit multiplication, and 8-bit swizzles (which we get for
1240 free). */
1241 for( i = 0; i < 4; i++ )
1242 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1243 for( i = 0; i < 4; i++ )
1244 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1245 odd_bytes( wtmp[ i ] ) );
1246 for( i = 0; i < 4; i++ )
1247 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1248 for( i = 0; i < 4; i++ )
1249 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1250 odd_bytes( wtmp[ i ] ) );
1251 brw_pop_insn_state( p );
1252
1253 /* Now we want to initialise the four rear gradients based on the
1254 hashes. Format conversion from signed integer to float leaves
1255 everything scaled too high by a factor of pow( 2, 15 ), but
1256 we correct for that right at the end. */
1257 /* x component */
1258 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1259 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1260 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1261 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1262 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1263
1264 brw_push_insn_state( p );
1265 brw_set_mask_control( p, BRW_MASK_DISABLE );
1266 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1267 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1268 brw_pop_insn_state( p );
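   /* Each SHL by 5 moves a fresh window of the hash bits into view, so the
    * y and z gradient components read from these registers below come from
    * different bits than the x components extracted above.
    */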
1269
1270 brw_MUL( p, x1y0, x1y0, t );
1271 brw_MUL( p, x1y1, x1y1, t );
1272 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1273 brw_MUL( p, x0y0, x0y0, param0 );
1274 brw_MUL( p, x0y1, x0y1, param0 );
1275
1276 /* y component */
1277 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1278 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1279 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1280 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1281
1282 brw_push_insn_state( p );
1283 brw_set_mask_control( p, BRW_MASK_DISABLE );
1284 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1285 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1286 brw_pop_insn_state( p );
1287
1288 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1289 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1290 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1291 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1292 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1293
1294 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1295 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1296 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1297 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1298
1299 /* z component */
1300 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1301 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1302 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1303 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1304
1305 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1306 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1307 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1308 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1309
1310 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1311 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1312 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1313 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1314
1315 /* We interpolate between the gradients using the polynomial
1316 6t^5 - 15t^4 + 10t^3 (Perlin). */
1317 brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
1318 brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
1319 brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
1320 brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
1321 brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
1322 brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
1323 brw_MUL( p, xi, xi, param0 );
1324 brw_MUL( p, yi, yi, param1 );
1325 brw_MUL( p, zi, zi, param2 );
1326 brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
1327 brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
1328 brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
1329 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
1330 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
1331 brw_MUL( p, xi, xi, param0 );
1332 brw_MUL( p, yi, yi, param1 );
1333 brw_MUL( p, zi, zi, param2 );
1334 brw_MUL( p, xi, xi, param0 );
1335 brw_MUL( p, yi, yi, param1 );
1336 brw_MUL( p, zi, zi, param2 );
1337 brw_MUL( p, xi, xi, param0 );
1338 brw_MUL( p, yi, yi, param1 );
1339 brw_MUL( p, zi, zi, param2 );
1340
1341 /* Here we interpolate in the y dimension... */
1342 brw_MUL( p, x0y1, x0y1, yi );
1343 brw_MUL( p, x1y1, x1y1, yi );
1344 brw_ADD( p, x0y0, x0y0, x0y1 );
1345 brw_ADD( p, x1y0, x1y0, x1y1 );
1346
1347 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1348 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1349 brw_MUL( p, x1y0, x1y0, xi );
1350 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
1351
1352 /* Now do the same thing for the front four gradients... */
1353 /* x component */
1354 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
1355 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
1356 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
1357 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
1358
1359 brw_push_insn_state( p );
1360 brw_set_mask_control( p, BRW_MASK_DISABLE );
1361 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1362 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1363 brw_pop_insn_state( p );
1364
1365 brw_MUL( p, x1y0, x1y0, t );
1366 brw_MUL( p, x1y1, x1y1, t );
1367 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1368 brw_MUL( p, x0y0, x0y0, param0 );
1369 brw_MUL( p, x0y1, x0y1, param0 );
1370
1371 /* y component */
1372 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1373 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1374 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1375 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1376
1377 brw_push_insn_state( p );
1378 brw_set_mask_control( p, BRW_MASK_DISABLE );
1379 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1380 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1381 brw_pop_insn_state( p );
1382
1383 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1384 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1385 brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
1386 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1387 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1388
1389 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1390 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1391 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1392 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1393
1394 /* z component */
1395 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1396 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1397 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1398 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1399
1400 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1401 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1402 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1403 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1404
1405 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1406 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1407 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1408 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1409
1410 /* The interpolation coefficients are still around from last time, so
1411 again interpolate in the y dimension... */
1412 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1413 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1414 brw_MUL( p, x0y1, x0y1, yi );
1415 brw_MUL( p, x1y1, x1y1, yi );
1416 brw_ADD( p, x0y0, x0y0, x0y1 );
1417 brw_ADD( p, x1y0, x1y0, x1y1 );
1418
1419 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
1420 time put the front face in tmp[ 1 ] and we're nearly there... */
1421 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1422 brw_MUL( p, x1y0, x1y0, xi );
1423 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
1424
1425 /* The final interpolation, in the z dimension: */
1426 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
1427 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
1428 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
1429
1430 /* scale by pow( 2, -15 ), as described above */
1431 brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
1432
1433 release_tmps( c, mark );
1434 }
1435
1436 static void emit_noise3( struct brw_wm_compile *c,
1437 const struct prog_instruction *inst )
1438 {
1439 struct brw_compile *p = &c->func;
1440 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1441 GLuint mask = inst->DstReg.WriteMask;
1442 int i;
1443 int mark = mark_tmps( c );
1444
1445 assert( mark == 0 );
1446
1447 src0 = get_src_reg( c, inst, 0, 0 );
1448 src1 = get_src_reg( c, inst, 0, 1 );
1449 src2 = get_src_reg( c, inst, 0, 2 );
1450
1451 param0 = alloc_tmp( c );
1452 param1 = alloc_tmp( c );
1453 param2 = alloc_tmp( c );
1454
1455 brw_MOV( p, param0, src0 );
1456 brw_MOV( p, param1, src1 );
1457 brw_MOV( p, param2, src2 );
1458
1459 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1460
1461 /* Fill in the result: */
1462 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1463 for (i = 0 ; i < 4; i++) {
1464 if (mask & (1<<i)) {
1465 dst = get_dst_reg(c, inst, i);
1466 brw_MOV( p, dst, param0 );
1467 }
1468 }
1469 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1470 brw_set_saturate( p, 0 );
1471
1472 release_tmps( c, mark );
1473 }
1474
1475 /**
1476 * For the four-dimensional case, the little micro-optimisation benefits
1477 * we obtain by unrolling all the loops aren't worth the massive bloat it
1478 * now causes. Instead, we loop twice around performing a similar operation
1479 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
1480 * code to glue it all together.
1481 */
1482 static void noise4_sub( struct brw_wm_compile *c )
1483 {
1484 struct brw_compile *p = &c->func;
1485 struct brw_reg param[ 4 ],
1486 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1487 w0, /* noise for the w=0 cube */
1488 floors[ 2 ], /* integer coordinates of base corner of hypercube */
1489 interp[ 4 ], /* interpolation coefficients */
1490 t, tmp[ 8 ], /* float temporaries */
1491 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1492 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1493 int i, j;
1494 int mark = mark_tmps( c );
1495 GLuint loop, origin;
1496
1497 x0y0 = alloc_tmp( c );
1498 x0y1 = alloc_tmp( c );
1499 x1y0 = alloc_tmp( c );
1500 x1y1 = alloc_tmp( c );
1501 t = alloc_tmp( c );
1502 w0 = alloc_tmp( c );
1503 floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1504 floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1505
1506 for( i = 0; i < 4; i++ ) {
1507 param[ i ] = lookup_tmp( c, mark - 5 + i );
1508 interp[ i ] = alloc_tmp( c );
1509 }
1510
1511 for( i = 0; i < 8; i++ ) {
1512 tmp[ i ] = alloc_tmp( c );
1513 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1514 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1515 }
1516
1517 brw_set_access_mode( p, BRW_ALIGN_1 );
1518
1519 /* We only want 16 bits of precision from the integral part of each
1520 co-ordinate, but unfortunately the RNDD semantics would saturate
1521 at 16 bits if we performed the operation directly to a 16-bit
1522 destination. Therefore, we round to 32-bit temporaries where
1523 appropriate, and then store only the lower 16 bits. */
1524 brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
1525 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
1526 brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
1527 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
1528 brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
1529 brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
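   /* floors[ 0 ] now packs the low 16 bits of floor(x) and floor(y) into its
    * low and high words; floors[ 1 ] does the same for floor(z) and floor(w).
    */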
1530
1531 /* Modify the flag register here, because the side effect is useful
1532 later (see below). We know for certain that all flags will be
1533 cleared, since the FRC instruction cannot possibly generate
1534 negative results. Even for exceptional inputs (infinities, denormals,
1535 NaNs), the architecture guarantees that the L conditional is false. */
1536 brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
1537 brw_FRC( p, param[ 0 ], param[ 0 ] );
1538 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1539 for( i = 1; i < 4; i++ )
1540 brw_FRC( p, param[ i ], param[ i ] );
1541
1542 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
1543 of all. */
1544 for( i = 0; i < 4; i++ )
1545 brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
1546 for( i = 0; i < 4; i++ )
1547 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
1548 for( i = 0; i < 4; i++ )
1549 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1550 for( i = 0; i < 4; i++ )
1551 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
1552 for( j = 0; j < 3; j++ )
1553 for( i = 0; i < 4; i++ )
1554 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1555
1556 /* Mark the current address, as it will be a jump destination. The
1557 following code will be executed twice: first, with the flag
1558 register clear indicating the w=0 case, and second with flags
1559 set for w=1. */
1560 loop = p->nr_insn;
1561
1562 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1563 be hashed. Since we have only 16 bits of precision in the hash, we
1564 must be careful about thorough mixing to maintain entropy as we
1565 squash the input vector into a small scalar. */
1566 brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
1567 brw_imm_uw( 0xBC8F ) );
1568 brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
1569 brw_imm_uw( 0xD0BD ) );
1570 brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
1571 brw_imm_uw( 0x9B93 ) );
1572 brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
1573 brw_imm_uw( 0xA359 ) );
1574 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1575 brw_imm_uw( 0xBC8F ) );
1576
1577 /* Temporarily disable the execution mask while we work with ExecSize=16
1578 channels (the mask is set for ExecSize=8 and is probably incorrect).
1579 Although this might cause execution of unwanted channels, the code
1580 writes only to temporary registers and has no side effects, so
1581 disabling the mask is harmless. */
1582 brw_push_insn_state( p );
1583 brw_set_mask_control( p, BRW_MASK_DISABLE );
1584 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1585 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1586 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1587
1588 /* We're now ready to perform the hashing. The eight hashes are
1589 interleaved for performance. The hash function used is
1590 designed to rapidly achieve avalanche and require only 16x16
1591 bit multiplication, and 8-bit swizzles (which we get for
1592 free). */
1593 for( i = 0; i < 4; i++ )
1594 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1595 for( i = 0; i < 4; i++ )
1596 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1597 odd_bytes( wtmp[ i ] ) );
1598 for( i = 0; i < 4; i++ )
1599 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1600 for( i = 0; i < 4; i++ )
1601 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1602 odd_bytes( wtmp[ i ] ) );
1603 brw_pop_insn_state( p );
1604
1605 /* Now we want to initialise the four rear gradients based on the
1606 hashes. Format conversion from signed integer to float leaves
1607 everything scaled too high by a factor of pow( 2, 15 ), but
1608 we correct for that right at the end. */
1609 /* x component */
1610 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
1611 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1612 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1613 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1614 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1615
1616 brw_push_insn_state( p );
1617 brw_set_mask_control( p, BRW_MASK_DISABLE );
1618 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1619 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1620 brw_pop_insn_state( p );
1621
1622 brw_MUL( p, x1y0, x1y0, t );
1623 brw_MUL( p, x1y1, x1y1, t );
1624 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
1625 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
1626 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
1627
1628 /* y component */
1629 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1630 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1631 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1632 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1633
1634 brw_push_insn_state( p );
1635 brw_set_mask_control( p, BRW_MASK_DISABLE );
1636 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1637 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1638 brw_pop_insn_state( p );
1639
1640 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1641 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1642 /* prepare t for the w component (used below): w the first time through
1643 the loop, w - 1 the second time. */
1644 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
1645 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
1646 p->current->header.predicate_inverse = 1;
1647 brw_MOV( p, t, param[ 3 ] );
1648 p->current->header.predicate_inverse = 0;
1649 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1650 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
1651 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
1652
1653 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1654 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1655 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1656 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1657
1658 /* z component */
1659 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1660 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1661 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1662 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1663
1664 brw_push_insn_state( p );
1665 brw_set_mask_control( p, BRW_MASK_DISABLE );
1666 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1667 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1668 brw_pop_insn_state( p );
1669
1670 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
1671 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
1672 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
1673 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
1674
1675 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1676 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1677 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1678 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1679
1680 /* w component */
1681 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1682 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1683 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1684 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1685
1686 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1687 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1688 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1689 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1690 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
1691
1692 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1693 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1694 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1695 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1696
1697 /* Here we interpolate in the y dimension... */
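   /* (a plain lerp: x0y0 + ( x0y1 - x0y0 ) * interp[ 1 ], and likewise for
      the x1 column) */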
1698 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1699 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1700 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
1701 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
1702 brw_ADD( p, x0y0, x0y0, x0y1 );
1703 brw_ADD( p, x1y0, x1y0, x1y1 );
1704
1705 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1706 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1707 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
1708 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
1709
1710 /* Now do the same thing for the front four gradients... */
1711 /* x component */
1712 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
1713 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
1714 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
1715 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
1716
1717 brw_push_insn_state( p );
1718 brw_set_mask_control( p, BRW_MASK_DISABLE );
1719 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
1720 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
1721 brw_pop_insn_state( p );
1722
1723 brw_MUL( p, x1y0, x1y0, t );
1724 brw_MUL( p, x1y1, x1y1, t );
1725 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
1726 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
1727 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
1728
1729 /* y component */
1730 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1731 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1732 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1733 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1734
1735 brw_push_insn_state( p );
1736 brw_set_mask_control( p, BRW_MASK_DISABLE );
1737 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
1738 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
1739 brw_pop_insn_state( p );
1740
1741 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1742 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1743 brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
1744 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
1745 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
1746
1747 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1748 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1749 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1750 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1751
1752 /* z component */
1753 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1754 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1755 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1756 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1757
1758 brw_push_insn_state( p );
1759 brw_set_mask_control( p, BRW_MASK_DISABLE );
1760 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
1761 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
1762 brw_pop_insn_state( p );
1763
1764 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1765 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1766 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1767 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1768    /* prepare t for the w component (used below: w the first time through
1769       the loop; w - 1 the second time). */
1770 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
1771 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
1772 p->current->header.predicate_inverse = 1;
1773 brw_MOV( p, t, param[ 3 ] );
1774 p->current->header.predicate_inverse = 0;
1775 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1776
1777 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1778 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1779 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1780 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1781
1782 /* w component */
1783 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1784 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1785 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1786 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1787
1788 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1789 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1790 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1791 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1792
1793 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1794 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1795 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1796 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1797
1798 /* Interpolate in the y dimension: */
1799 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1800 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1801 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
1802 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
1803 brw_ADD( p, x0y0, x0y0, x0y1 );
1804 brw_ADD( p, x1y0, x1y0, x1y1 );
1805
1806 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
1807 time put the front face in tmp[ 1 ] and we're nearly there... */
1808 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1809 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
1810 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
1811
1812 /* Another interpolation, in the z dimension: */
1813 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
1814 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
1815 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
1816
1817 /* Exit the loop if we've computed both cubes... */
1818 origin = p->nr_insn;
1819 brw_push_insn_state( p );
1820 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
1821 brw_set_mask_control( p, BRW_MASK_DISABLE );
1822 brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
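   /* The zero immediate above is only a placeholder jump distance; it is
      patched with brw_set_src1() below, once the end of the loop is known
      (distances are expressed in 16-byte instruction units, hence the << 4
      used there). */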
1823 brw_pop_insn_state( p );
1824
1825 /* Save the result for the w=0 case, and increment the w coordinate: */
1826 brw_MOV( p, w0, tmp[ 0 ] );
1827 brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
1828 brw_imm_uw( 1 ) );
1829
1830 /* Loop around for the other cube. Explicitly set the flag register
1831 (unfortunately we must spend an extra instruction to do this: we
1832 can't rely on a side effect of the previous MOV or ADD because
1833 conditional modifiers which are normally true might be false in
1834 exceptional circumstances, e.g. given a NaN input; the add to
1835 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
1836 brw_push_insn_state( p );
1837 brw_set_mask_control( p, BRW_MASK_DISABLE );
1838 brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
1839 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
1840 brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
1841 brw_pop_insn_state( p );
1842
1843 /* Patch the previous conditional branch now that we know the
1844 destination address. */
1845 brw_set_src1( p->store + origin,
1846 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
1847
1848 /* The very last interpolation. */
1849 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
1850 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
1851 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
1852
1853 /* scale by pow( 2, -15 ), as described above */
1854 brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
1855
1856 release_tmps( c, mark );
1857 }
1858
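/**
 * Emit code for a 4D noise instruction: copy the four source components
 * into temporaries, invoke the shared noise4 subroutine, and write the
 * scalar result (left in the first temporary) to every enabled channel
 * of the destination.
 */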
1859 static void emit_noise4( struct brw_wm_compile *c,
1860 const struct prog_instruction *inst )
1861 {
1862 struct brw_compile *p = &c->func;
1863 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
1864 GLuint mask = inst->DstReg.WriteMask;
1865 int i;
1866 int mark = mark_tmps( c );
1867
1868 assert( mark == 0 );
1869
1870 src0 = get_src_reg( c, inst, 0, 0 );
1871 src1 = get_src_reg( c, inst, 0, 1 );
1872 src2 = get_src_reg( c, inst, 0, 2 );
1873 src3 = get_src_reg( c, inst, 0, 3 );
1874
1875 param0 = alloc_tmp( c );
1876 param1 = alloc_tmp( c );
1877 param2 = alloc_tmp( c );
1878 param3 = alloc_tmp( c );
1879
1880 brw_MOV( p, param0, src0 );
1881 brw_MOV( p, param1, src1 );
1882 brw_MOV( p, param2, src2 );
1883 brw_MOV( p, param3, src3 );
1884
1885 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
1886
1887 /* Fill in the result: */
1888 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1889 for (i = 0 ; i < 4; i++) {
1890 if (mask & (1<<i)) {
1891 dst = get_dst_reg(c, inst, i);
1892 brw_MOV( p, dst, param0 );
1893 }
1894 }
1895 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1896 brw_set_saturate( p, 0 );
1897
1898 release_tmps( c, mark );
1899 }
1900
1901
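/**
 * Emit code for OPCODE_TXB (texture sample with LOD bias): build a SIMD8
 * sampler message with the s/t/r coordinates in m2-m4, the bias in m5 and
 * an (apparently unused) ref value in m6, then send it with brw_SAMPLE.
 */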
1902 /* TODO
1903 BIAS on SIMD8 not working yet...
1904 */
1905 static void emit_txb(struct brw_wm_compile *c,
1906 const struct prog_instruction *inst)
1907 {
1908 struct brw_compile *p = &c->func;
1909 struct brw_reg dst[4], src[4], payload_reg;
1910 /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
1911 const GLuint unit = inst->TexSrcUnit;
1912 GLuint i;
1913 GLuint msg_type;
1914
1915 assert(unit < BRW_MAX_TEX_UNIT);
1916
1917 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
1918
1919 for (i = 0; i < 4; i++)
1920 dst[i] = get_dst_reg(c, inst, i);
1921 for (i = 0; i < 4; i++)
1922 src[i] = get_src_reg(c, inst, 0, i);
1923
1924 switch (inst->TexSrcTarget) {
1925 case TEXTURE_1D_INDEX:
1926 brw_MOV(p, brw_message_reg(2), src[0]); /* s coord */
1927 brw_MOV(p, brw_message_reg(3), brw_imm_f(0)); /* t coord */
1928 brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); /* r coord */
1929 break;
1930 case TEXTURE_2D_INDEX:
1931 case TEXTURE_RECT_INDEX:
1932 brw_MOV(p, brw_message_reg(2), src[0]);
1933 brw_MOV(p, brw_message_reg(3), src[1]);
1934 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
1935 break;
1936 case TEXTURE_3D_INDEX:
1937 case TEXTURE_CUBE_INDEX:
1938 brw_MOV(p, brw_message_reg(2), src[0]);
1939 brw_MOV(p, brw_message_reg(3), src[1]);
1940 brw_MOV(p, brw_message_reg(4), src[2]);
1941 break;
1942 default:
1943 /* invalid target */
1944 abort();
1945 }
1946 brw_MOV(p, brw_message_reg(5), src[3]); /* bias */
1947 brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); /* ref (unused?) */
1948
1949 if (BRW_IS_IGDNG(p->brw)) {
1950 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG;
1951 } else {
1952 /* Does it work well on SIMD8? */
1953 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1954 }
1955
1956 brw_SAMPLE(p,
1957 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
1958 1, /* msg_reg_nr */
1959 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
1960 SURF_INDEX_TEXTURE(unit),
1961 unit, /* sampler */
1962 inst->DstReg.WriteMask, /* writemask */
1963 msg_type, /* msg_type */
1964 4, /* response_length */
1965 4, /* msg_length */
1966 0, /* eot */
1967 1,
1968 BRW_SAMPLER_SIMD_MODE_SIMD8);
1969 }
1970
1971
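/**
 * Emit code for OPCODE_TEX: load the coordinate components required by the
 * texture target into consecutive message registers (zero-filling the unused
 * ones), append the lod/ref values when shadow sampling, and issue a SIMD8
 * sampler message.  For shadow targets the W result is forced to 1.0.
 */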
1972 static void emit_tex(struct brw_wm_compile *c,
1973 const struct prog_instruction *inst)
1974 {
1975 struct brw_compile *p = &c->func;
1976 struct brw_reg dst[4], src[4], payload_reg;
1977 /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
1978 const GLuint unit = inst->TexSrcUnit;
1979 GLuint msg_len;
1980 GLuint i, nr;
1981 GLuint emit;
1982 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
1983 GLuint msg_type;
1984
1985 assert(unit < BRW_MAX_TEX_UNIT);
1986
1987 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
1988
1989 for (i = 0; i < 4; i++)
1990 dst[i] = get_dst_reg(c, inst, i);
1991 for (i = 0; i < 4; i++)
1992 src[i] = get_src_reg(c, inst, 0, i);
1993
1994 switch (inst->TexSrcTarget) {
1995 case TEXTURE_1D_INDEX:
1996 emit = WRITEMASK_X;
1997 nr = 1;
1998 break;
1999 case TEXTURE_2D_INDEX:
2000 case TEXTURE_RECT_INDEX:
2001 emit = WRITEMASK_XY;
2002 nr = 2;
2003 break;
2004 case TEXTURE_3D_INDEX:
2005 case TEXTURE_CUBE_INDEX:
2006 emit = WRITEMASK_XYZ;
2007 nr = 3;
2008 break;
2009 default:
2010 /* invalid target */
2011 abort();
2012 }
2013 msg_len = 1;
2014
2015 /* move/load S, T, R coords */
2016 for (i = 0; i < nr; i++) {
2017 static const GLuint swz[4] = {0,1,2,2};
2018 if (emit & (1<<i))
2019 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2020 else
2021 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2022 msg_len += 1;
2023 }
2024
2025 if (shadow) {
2026 brw_MOV(p, brw_message_reg(5), brw_imm_f(0)); /* lod / bias */
2027 brw_MOV(p, brw_message_reg(6), src[2]); /* ref value / R coord */
2028 }
2029
2030 if (BRW_IS_IGDNG(p->brw)) {
2031 if (shadow)
2032 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG;
2033 else
2034 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG;
2035 } else {
2036       /* Does it work for shadow on SIMD8? */
2037 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2038 }
2039
2040 brw_SAMPLE(p,
2041 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2042 1, /* msg_reg_nr */
2043 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2044 SURF_INDEX_TEXTURE(unit),
2045 unit, /* sampler */
2046 inst->DstReg.WriteMask, /* writemask */
2047 msg_type, /* msg_type */
2048 4, /* response_length */
2049 shadow ? 6 : 4, /* msg_length */
2050 0, /* eot */
2051 1,
2052 BRW_SAMPLER_SIMD_MODE_SIMD8);
2053
2054 if (shadow)
2055 brw_MOV(p, dst[3], brw_imm_f(1.0));
2056 }
2057
2058
2059 /**
2060 * Resolve subroutine calls after code emit is done.
2061 */
2062 static void post_wm_emit( struct brw_wm_compile *c )
2063 {
2064 brw_resolve_cals(&c->func);
2065 }
2066
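/**
 * Fetch the hardware registers for the masked components of source operand
 * 'index' of the given Mesa instruction.
 */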
2067 static void
2068 get_argument_regs(struct brw_wm_compile *c,
2069 const struct prog_instruction *inst,
2070 int index,
2071 struct brw_reg *regs,
2072 int mask)
2073 {
2074 int i;
2075
2076 for (i = 0; i < 4; i++) {
2077 if (mask & (1 << i))
2078 regs[i] = get_src_reg(c, inst, index, i);
2079 }
2080 }
2081
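/**
 * Main code generation loop for the GLSL path: walk the instruction list
 * produced by brw_wm_pass_fp() and emit native code for each opcode,
 * including structured flow control (IF/ELSE/ENDIF, loops) and the software
 * call/return stack used for subroutines.
 */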
2082 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2083 {
2084 #define MAX_IF_DEPTH 32
2085 #define MAX_LOOP_DEPTH 32
2086 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
2087 GLuint i, if_depth = 0, loop_depth = 0;
2088 struct brw_compile *p = &c->func;
2089 struct brw_indirect stack_index = brw_indirect(0, 0);
2090
2091 c->out_of_regs = GL_FALSE;
2092
2093 prealloc_reg(c);
2094 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2095 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2096
2097 for (i = 0; i < c->nr_fp_insns; i++) {
2098 const struct prog_instruction *inst = &c->prog_instructions[i];
2099 int dst_flags;
2100 struct brw_reg args[3][4], dst[4];
2101 int j;
2102
2103 c->cur_inst = i;
2104
2105 #if 0
2106 _mesa_printf("Inst %d: ", i);
2107 _mesa_print_instruction(inst);
2108 #endif
2109
2110 /* fetch any constants that this instruction needs */
2111 if (c->fp->use_const_buffer)
2112 fetch_constants(c, inst);
2113
2114 if (inst->Opcode != OPCODE_ARL) {
2115 for (j = 0; j < 4; j++) {
2116 if (inst->DstReg.WriteMask & (1 << j))
2117 dst[j] = get_dst_reg(c, inst, j);
2118 else
2119 dst[j] = brw_null_reg();
2120 }
2121 }
2122 for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
2123 get_argument_regs(c, inst, j, args[j], WRITEMASK_XYZW);
2124
2125 dst_flags = inst->DstReg.WriteMask;
2126 if (inst->SaturateMode == SATURATE_ZERO_ONE)
2127 dst_flags |= SATURATE;
2128
2129 if (inst->CondUpdate)
2130 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2131 else
2132 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2133
2134 dst_flags = inst->DstReg.WriteMask;
2135 if (inst->SaturateMode == SATURATE_ZERO_ONE)
2136 dst_flags |= SATURATE;
2137
2138 switch (inst->Opcode) {
2139 case WM_PIXELXY:
2140 emit_pixel_xy(c, dst, dst_flags);
2141 break;
2142 case WM_DELTAXY:
2143 emit_delta_xy(p, dst, dst_flags, args[0]);
2144 break;
2145 case WM_PIXELW:
2146 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
2147 break;
2148 case WM_LINTERP:
2149 emit_linterp(p, dst, dst_flags, args[0], args[1]);
2150 break;
2151 case WM_PINTERP:
2152 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
2153 break;
2154 case WM_CINTERP:
2155 emit_cinterp(p, dst, dst_flags, args[0]);
2156 break;
2157 case WM_WPOSXY:
2158 emit_wpos_xy(c, dst, dst_flags, args[0]);
2159 break;
2160 case WM_FB_WRITE:
2161 emit_fb_write(c, inst);
2162 break;
2163 case WM_FRONTFACING:
2164 emit_frontfacing(p, dst, dst_flags);
2165 break;
2166 case OPCODE_ADD:
2167 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
2168 break;
2169 case OPCODE_ARL:
2170 emit_arl(c, inst);
2171 break;
2172 case OPCODE_FRC:
2173 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
2174 break;
2175 case OPCODE_FLR:
2176 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
2177 break;
2178 case OPCODE_LRP:
2179 unalias3(c, emit_lrp,
2180 dst, dst_flags, args[0], args[1], args[2]);
2181 break;
2182 case OPCODE_TRUNC:
2183 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
2184 break;
2185 case OPCODE_MOV:
2186 case OPCODE_SWZ:
2187 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
2188 break;
2189 case OPCODE_DP3:
2190 emit_dp3(p, dst, dst_flags, args[0], args[1]);
2191 break;
2192 case OPCODE_DP4:
2193 emit_dp4(p, dst, dst_flags, args[0], args[1]);
2194 break;
2195 case OPCODE_XPD:
2196 emit_xpd(p, dst, dst_flags, args[0], args[1]);
2197 break;
2198 case OPCODE_DPH:
2199 emit_dph(p, dst, dst_flags, args[0], args[1]);
2200 break;
2201 case OPCODE_RCP:
2202 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
2203 break;
2204 case OPCODE_RSQ:
2205 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
2206 break;
2207 case OPCODE_SIN:
2208 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
2209 break;
2210 case OPCODE_COS:
2211 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
2212 break;
2213 case OPCODE_EX2:
2214 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
2215 break;
2216 case OPCODE_LG2:
2217 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
2218 break;
2219 case OPCODE_MIN:
2220 case OPCODE_MAX:
2221 emit_min_max(c, inst);
2222 break;
2223 case OPCODE_DDX:
2224 case OPCODE_DDY:
2225 emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
2226 args[0]);
2227 break;
2228 case OPCODE_SLT:
2229 emit_sop(p, dst, dst_flags,
2230 BRW_CONDITIONAL_L, args[0], args[1]);
2231 break;
2232 case OPCODE_SLE:
2233 emit_sop(p, dst, dst_flags,
2234 BRW_CONDITIONAL_LE, args[0], args[1]);
2235 break;
2236 case OPCODE_SGT:
2237 emit_sop(p, dst, dst_flags,
2238 BRW_CONDITIONAL_G, args[0], args[1]);
2239 break;
2240 case OPCODE_SGE:
2241 emit_sop(p, dst, dst_flags,
2242 BRW_CONDITIONAL_GE, args[0], args[1]);
2243 break;
2244 case OPCODE_SEQ:
2245 emit_sop(p, dst, dst_flags,
2246 BRW_CONDITIONAL_EQ, args[0], args[1]);
2247 break;
2248 case OPCODE_SNE:
2249 emit_sop(p, dst, dst_flags,
2250 BRW_CONDITIONAL_NEQ, args[0], args[1]);
2251 break;
2252 case OPCODE_MUL:
2253 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
2254 break;
2255 case OPCODE_POW:
2256 emit_math2(c, BRW_MATH_FUNCTION_POW,
2257 dst, dst_flags, args[0], args[1]);
2258 break;
2259 case OPCODE_MAD:
2260 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
2261 break;
2262 case OPCODE_NOISE1:
2263 emit_noise1(c, inst);
2264 break;
2265 case OPCODE_NOISE2:
2266 emit_noise2(c, inst);
2267 break;
2268 case OPCODE_NOISE3:
2269 emit_noise3(c, inst);
2270 break;
2271 case OPCODE_NOISE4:
2272 emit_noise4(c, inst);
2273 break;
2274 case OPCODE_TEX:
2275 emit_tex(c, inst);
2276 break;
2277 case OPCODE_TXB:
2278 emit_txb(c, inst);
2279 break;
2280 case OPCODE_KIL_NV:
2281 emit_kil(c);
2282 break;
2283 case OPCODE_IF:
2284 assert(if_depth < MAX_IF_DEPTH);
2285 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
2286 break;
2287 case OPCODE_ELSE:
2288 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
2289 break;
2290 case OPCODE_ENDIF:
2291 assert(if_depth > 0);
2292 brw_ENDIF(p, if_inst[--if_depth]);
2293 break;
2294 case OPCODE_BGNSUB:
2295 brw_save_label(p, inst->Comment, p->nr_insn);
2296 break;
2297 case OPCODE_ENDSUB:
2298 /* no-op */
2299 break;
2300 case OPCODE_CAL:
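         /* Software call sequence: store the return address (the IP three
          * instructions ahead, i.e. just past the jump emitted below) into
          * the current stack slot, bump the stack pointer by one dword, then
          * emit a placeholder jump that brw_resolve_cals() patches to the
          * subroutine's label after code emission. */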
2301 brw_push_insn_state(p);
2302 brw_set_mask_control(p, BRW_MASK_DISABLE);
2303 brw_set_access_mode(p, BRW_ALIGN_1);
2304 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2305 brw_set_access_mode(p, BRW_ALIGN_16);
2306 brw_ADD(p, get_addr_reg(stack_index),
2307 get_addr_reg(stack_index), brw_imm_d(4));
2308 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2309 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2310 brw_pop_insn_state(p);
2311 break;
2312
2313 case OPCODE_RET:
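         /* Matching return: step the stack pointer back one dword and load
          * the saved return address into the IP register. */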
2314 brw_push_insn_state(p);
2315 brw_set_mask_control(p, BRW_MASK_DISABLE);
2316 brw_ADD(p, get_addr_reg(stack_index),
2317 get_addr_reg(stack_index), brw_imm_d(-4));
2318 brw_set_access_mode(p, BRW_ALIGN_1);
2319 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2320 brw_set_access_mode(p, BRW_ALIGN_16);
2321 brw_pop_insn_state(p);
2322
2323 break;
2324         case OPCODE_BGNLOOP:
2325            /* XXX may need to invalidate the current_constant regs */
                assert(loop_depth < MAX_LOOP_DEPTH);
2326            loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2327 break;
2328 case OPCODE_BRK:
2329 brw_BREAK(p);
2330 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2331 break;
2332 case OPCODE_CONT:
2333 brw_CONT(p);
2334 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2335 break;
2336 case OPCODE_ENDLOOP:
2337 {
2338 struct brw_instruction *inst0, *inst1;
2339 GLuint br = 1;
2340
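         /* On IGDNG (Ironlake) branch offsets appear to be counted in
            finer-grained units, so the jump counts computed below are
            doubled. */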
2341 if (BRW_IS_IGDNG(brw))
2342 br = 2;
2343
2344 loop_depth--;
2345 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2346            /* patch all the BREAK/CONT instructions from the last BGNLOOP */
2347 while (inst0 > loop_inst[loop_depth]) {
2348 inst0--;
2349 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2350 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2351 inst0->bits3.if_else.pop_count = 0;
2352 }
2353 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2354 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2355 inst0->bits3.if_else.pop_count = 0;
2356 }
2357 }
2358 }
2359 break;
2360 default:
2361 _mesa_printf("unsupported IR in fragment shader %d\n",
2362 inst->Opcode);
2363 }
2364
2365 if (inst->CondUpdate)
2366 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2367 else
2368 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2369 }
2370 post_wm_emit(c);
2371
2372 if (INTEL_DEBUG & DEBUG_WM) {
2373 _mesa_printf("wm-native:\n");
2374 for (i = 0; i < p->nr_insn; i++)
2375 brw_disasm(stderr, &p->store[i]);
2376 _mesa_printf("\n");
2377 }
2378 }
2379
2380 /**
2381  * Do GPU code generation for shaders that use GLSL features such as
2382  * flow control.  Other shaders will be compiled with the simpler non-GLSL path.
2383  */
2384 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2385 {
2386 if (INTEL_DEBUG & DEBUG_WM) {
2387 _mesa_printf("brw_wm_glsl_emit:\n");
2388 }
2389
2390 /* initial instruction translation/simplification */
2391 brw_wm_pass_fp(c);
2392
2393 /* actual code generation */
2394 brw_wm_emit_glsl(brw, c);
2395
2396 if (INTEL_DEBUG & DEBUG_WM) {
2397 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2398 }
2399
2400 c->prog_data.total_grf = num_grf_used(c);
2401 c->prog_data.total_scratch = 0;
2402 }