f480609b532f674516228a4d9bc70139605c8240
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
/**
 * Slot identifiers for shared instruction sequences; used to index
 * c->subroutines[] in invoke_subroutine().
 */
enum _subroutine {
   SUB_NOISE1,
   SUB_NOISE2,
   SUB_NOISE3,
   SUB_NOISE4
};
12
13
14 /**
15 * Determine if the given fragment program uses GLSL features such
16 * as flow conditionals, loops, subroutines.
17 * Some GLSL shaders may use these features, others might not.
18 */
19 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
20 {
21 int i;
22 for (i = 0; i < fp->Base.NumInstructions; i++) {
23 const struct prog_instruction *inst = &fp->Base.Instructions[i];
24 switch (inst->Opcode) {
25 case OPCODE_ARL:
26 case OPCODE_IF:
27 case OPCODE_ENDIF:
28 case OPCODE_CAL:
29 case OPCODE_BRK:
30 case OPCODE_RET:
31 case OPCODE_DDX:
32 case OPCODE_DDY:
33 case OPCODE_NOISE1:
34 case OPCODE_NOISE2:
35 case OPCODE_NOISE3:
36 case OPCODE_NOISE4:
37 case OPCODE_BGNLOOP:
38 return GL_TRUE;
39 default:
40 break;
41 }
42 }
43 return GL_FALSE;
44 }
45
46
47
/* Forward declaration: alloc_grf() calls this before its definition. */
static void reclaim_temps(struct brw_wm_compile *c);
50
51
52 /** Mark GRF register as used. */
53 static void
54 prealloc_grf(struct brw_wm_compile *c, int r)
55 {
56 c->used_grf[r] = GL_TRUE;
57 }
58
59
60 /** Mark given GRF register as not in use. */
61 static void
62 release_grf(struct brw_wm_compile *c, int r)
63 {
64 /*assert(c->used_grf[r]);*/
65 c->used_grf[r] = GL_FALSE;
66 c->first_free_grf = MIN2(c->first_free_grf, r);
67 }
68
69
70 /** Return index of a free GRF, mark it as used. */
71 static int
72 alloc_grf(struct brw_wm_compile *c)
73 {
74 GLuint r;
75 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
76 if (!c->used_grf[r]) {
77 c->used_grf[r] = GL_TRUE;
78 c->first_free_grf = r + 1; /* a guess */
79 return r;
80 }
81 }
82
83 /* no free temps, try to reclaim some */
84 reclaim_temps(c);
85 c->first_free_grf = 0;
86
87 /* try alloc again */
88 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
89 if (!c->used_grf[r]) {
90 c->used_grf[r] = GL_TRUE;
91 c->first_free_grf = r + 1; /* a guess */
92 return r;
93 }
94 }
95
96 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
97 assert(c->used_grf[r]);
98 }
99
100 /* really, no free GRF regs found */
101 if (!c->out_of_regs) {
102 /* print warning once per compilation */
103 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
104 c->out_of_regs = GL_TRUE;
105 }
106
107 return -1;
108 }
109
110
111 /** Return number of GRF registers used */
112 static int
113 num_grf_used(const struct brw_wm_compile *c)
114 {
115 int r;
116 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
117 if (c->used_grf[r])
118 return r + 1;
119 return 0;
120 }
121
122
123
124 /**
125 * Record the mapping of a Mesa register to a hardware register.
126 */
127 static void set_reg(struct brw_wm_compile *c, int file, int index,
128 int component, struct brw_reg reg)
129 {
130 c->wm_regs[file][index][component].reg = reg;
131 c->wm_regs[file][index][component].inited = GL_TRUE;
132 }
133
134 /**
135 * Examine instruction's write mask to find index of first component
136 * enabled for writing.
137 */
138 static int get_scalar_dst_index(const struct prog_instruction *inst)
139 {
140 int i;
141 for (i = 0; i < 4; i++)
142 if (inst->DstReg.WriteMask & (1<<i))
143 break;
144 return i;
145 }
146
147 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
148 {
149 struct brw_reg reg;
150
151 /* if we need to allocate another temp, grow the tmp_regs[] array */
152 if (c->tmp_index == c->tmp_max) {
153 int r = alloc_grf(c);
154 if (r < 0) {
155 /*printf("Out of temps in %s\n", __FUNCTION__);*/
156 r = 50; /* XXX random register! */
157 }
158 c->tmp_regs[ c->tmp_max++ ] = r;
159 }
160
161 /* form the GRF register */
162 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
163 /*printf("alloc_temp %d\n", reg.nr);*/
164 assert(reg.nr < BRW_WM_MAX_GRF);
165 return reg;
166
167 }
168
169 /**
170 * Save current temp register info.
171 * There must be a matching call to release_tmps().
172 */
173 static int mark_tmps(struct brw_wm_compile *c)
174 {
175 return c->tmp_index;
176 }
177
178 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
179 {
180 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
181 }
182
183 static void release_tmps(struct brw_wm_compile *c, int mark)
184 {
185 c->tmp_index = mark;
186 }
187
188 /**
189 * Convert Mesa src register to brw register.
190 *
191 * Since we're running in SOA mode each Mesa register corresponds to four
192 * hardware registers. We allocate the hardware registers as needed here.
193 *
194 * \param file register file, one of PROGRAM_x
195 * \param index register number
196 * \param component src component (X=0, Y=1, Z=2, W=3)
197 * \param nr not used?!?
198 * \param neg negate value?
199 * \param abs take absolute value?
200 */
201 static struct brw_reg
202 get_reg(struct brw_wm_compile *c, int file, int index, int component,
203 int nr, GLuint neg, GLuint abs)
204 {
205 struct brw_reg reg;
206 switch (file) {
207 case PROGRAM_STATE_VAR:
208 case PROGRAM_CONSTANT:
209 case PROGRAM_UNIFORM:
210 file = PROGRAM_STATE_VAR;
211 break;
212 case PROGRAM_UNDEFINED:
213 return brw_null_reg();
214 case PROGRAM_TEMPORARY:
215 case PROGRAM_INPUT:
216 case PROGRAM_OUTPUT:
217 case PROGRAM_PAYLOAD:
218 break;
219 default:
220 _mesa_problem(NULL, "Unexpected file in get_reg()");
221 return brw_null_reg();
222 }
223
224 assert(index < 256);
225 assert(component < 4);
226
227 /* see if we've already allocated a HW register for this Mesa register */
228 if (c->wm_regs[file][index][component].inited) {
229 /* yes, re-use */
230 reg = c->wm_regs[file][index][component].reg;
231 }
232 else {
233 /* no, allocate new register */
234 int grf = alloc_grf(c);
235 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
236 if (grf < 0) {
237 /* totally out of temps */
238 grf = 51; /* XXX random register! */
239 }
240
241 reg = brw_vec8_grf(grf, 0);
242 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
243
244 set_reg(c, file, index, component, reg);
245 }
246
247 if (neg & (1 << component)) {
248 reg = negate(reg);
249 }
250 if (abs)
251 reg = brw_abs(reg);
252 return reg;
253 }
254
255
256
257 /**
258 * This is called if we run out of GRF registers. Examine the live intervals
259 * of temp regs in the program and free those which won't be used again.
260 */
261 static void
262 reclaim_temps(struct brw_wm_compile *c)
263 {
264 GLint intBegin[MAX_PROGRAM_TEMPS];
265 GLint intEnd[MAX_PROGRAM_TEMPS];
266 int index;
267
268 /*printf("Reclaim temps:\n");*/
269
270 _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
271 intBegin, intEnd);
272
273 for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
274 if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
275 /* program temp[i] can be freed */
276 int component;
277 /*printf(" temp[%d] is dead\n", index);*/
278 for (component = 0; component < 4; component++) {
279 if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
280 int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
281 release_grf(c, r);
282 /*
283 printf(" Reclaim temp %d, reg %d at inst %d\n",
284 index, r, c->cur_inst);
285 */
286 c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
287 }
288 }
289 }
290 }
291 }
292
293
294
295
296 /**
297 * Preallocate registers. This sets up the Mesa to hardware register
298 * mapping for certain registers, such as constants (uniforms/state vars)
299 * and shader inputs.
300 */
301 static void prealloc_reg(struct brw_wm_compile *c)
302 {
303 int i, j;
304 struct brw_reg reg;
305 int urb_read_length = 0;
306 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
307 GLuint reg_index = 0;
308
309 memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
310 c->first_free_grf = 0;
311
312 for (i = 0; i < 4; i++) {
313 if (i < c->key.nr_depth_regs)
314 reg = brw_vec8_grf(i * 2, 0);
315 else
316 reg = brw_vec8_grf(0, 0);
317 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
318 }
319 reg_index += 2 * c->key.nr_depth_regs;
320
321 /* constants */
322 {
323 const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
324 const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
325
326 /* use a real constant buffer, or just use a section of the GRF? */
327 /* XXX this heuristic may need adjustment... */
328 if ((nr_params + nr_temps) * 4 + reg_index > 80)
329 c->fp->use_const_buffer = GL_TRUE;
330 else
331 c->fp->use_const_buffer = GL_FALSE;
332 /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
333
334 if (c->fp->use_const_buffer) {
335 /* We'll use a real constant buffer and fetch constants from
336 * it with a dataport read message.
337 */
338
339 /* number of float constants in CURBE */
340 c->prog_data.nr_params = 0;
341 }
342 else {
343 const struct gl_program_parameter_list *plist =
344 c->fp->program.Base.Parameters;
345 int index = 0;
346
347 /* number of float constants in CURBE */
348 c->prog_data.nr_params = 4 * nr_params;
349
350 /* loop over program constants (float[4]) */
351 for (i = 0; i < nr_params; i++) {
352 /* loop over XYZW channels */
353 for (j = 0; j < 4; j++, index++) {
354 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
355 /* Save pointer to parameter/constant value.
356 * Constants will be copied in prepare_constant_buffer()
357 */
358 c->prog_data.param[index] = &plist->ParameterValues[i][j];
359 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
360 }
361 }
362 /* number of constant regs used (each reg is float[8]) */
363 c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
364 reg_index += c->nr_creg;
365 }
366 }
367
368 /* fragment shader inputs */
369 for (i = 0; i < VERT_RESULT_MAX; i++) {
370 int fp_input;
371
372 if (i >= VERT_RESULT_VAR0)
373 fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
374 else if (i <= VERT_RESULT_TEX7)
375 fp_input = i;
376 else
377 fp_input = -1;
378
379 if (fp_input >= 0 && inputs & (1 << fp_input)) {
380 urb_read_length = reg_index;
381 reg = brw_vec8_grf(reg_index, 0);
382 for (j = 0; j < 4; j++)
383 set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
384 }
385 if (c->key.vp_outputs_written & (1 << i)) {
386 reg_index += 2;
387 }
388 }
389
390 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
391 c->prog_data.urb_read_length = urb_read_length;
392 c->prog_data.curb_read_length = c->nr_creg;
393 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
394 reg_index++;
395 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
396 reg_index += 2;
397
398 /* mark GRF regs [0..reg_index-1] as in-use */
399 for (i = 0; i < reg_index; i++)
400 prealloc_grf(c, i);
401
402 /* Don't use GRF 126, 127. Using them seems to lead to GPU lock-ups */
403 prealloc_grf(c, 126);
404 prealloc_grf(c, 127);
405
406 /* An instruction may reference up to three constants.
407 * They'll be found in these registers.
408 * XXX alloc these on demand!
409 */
410 if (c->fp->use_const_buffer) {
411 for (i = 0; i < 3; i++) {
412 c->current_const[i].index = -1;
413 c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
414 }
415 }
416 #if 0
417 printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
418 printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
419 #endif
420 }
421
422
423 /**
424 * Check if any of the instruction's src registers are constants, uniforms,
425 * or statevars. If so, fetch any constants that we don't already have in
426 * the three GRF slots.
427 */
428 static void fetch_constants(struct brw_wm_compile *c,
429 const struct prog_instruction *inst)
430 {
431 struct brw_compile *p = &c->func;
432 GLuint i;
433
434 /* loop over instruction src regs */
435 for (i = 0; i < 3; i++) {
436 const struct prog_src_register *src = &inst->SrcReg[i];
437 if (src->File == PROGRAM_STATE_VAR ||
438 src->File == PROGRAM_CONSTANT ||
439 src->File == PROGRAM_UNIFORM) {
440 c->current_const[i].index = src->Index;
441
442 #if 0
443 printf(" fetch const[%d] for arg %d into reg %d\n",
444 src->Index, i, c->current_const[i].reg.nr);
445 #endif
446
447 /* need to fetch the constant now */
448 brw_dp_READ_4(p,
449 c->current_const[i].reg, /* writeback dest */
450 src->RelAddr, /* relative indexing? */
451 16 * src->Index, /* byte offset */
452 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
453 );
454 }
455 }
456 }
457
458
459 /**
460 * Convert Mesa dst register to brw register.
461 */
462 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
463 const struct prog_instruction *inst,
464 GLuint component)
465 {
466 const int nr = 1;
467 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
468 0, 0);
469 }
470
471
472 static struct brw_reg
473 get_src_reg_const(struct brw_wm_compile *c,
474 const struct prog_instruction *inst,
475 GLuint srcRegIndex, GLuint component)
476 {
477 /* We should have already fetched the constant from the constant
478 * buffer in fetch_constants(). Now we just have to return a
479 * register description that extracts the needed component and
480 * smears it across all eight vector components.
481 */
482 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
483 struct brw_reg const_reg;
484
485 assert(component < 4);
486 assert(srcRegIndex < 3);
487 assert(c->current_const[srcRegIndex].index != -1);
488 const_reg = c->current_const[srcRegIndex].reg;
489
490 /* extract desired float from the const_reg, and smear */
491 const_reg = stride(const_reg, 0, 1, 0);
492 const_reg.subnr = component * 4;
493
494 if (src->Negate & (1 << component))
495 const_reg = negate(const_reg);
496 if (src->Abs)
497 const_reg = brw_abs(const_reg);
498
499 #if 0
500 printf(" form const[%d].%d for arg %d, reg %d\n",
501 c->current_const[srcRegIndex].index,
502 component,
503 srcRegIndex,
504 const_reg.nr);
505 #endif
506
507 return const_reg;
508 }
509
510
511 /**
512 * Convert Mesa src register to brw register.
513 */
514 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
515 const struct prog_instruction *inst,
516 GLuint srcRegIndex, GLuint channel)
517 {
518 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
519 const GLuint nr = 1;
520 const GLuint component = GET_SWZ(src->Swizzle, channel);
521
522 /* Extended swizzle terms */
523 if (component == SWIZZLE_ZERO) {
524 return brw_imm_f(0.0F);
525 }
526 else if (component == SWIZZLE_ONE) {
527 return brw_imm_f(1.0F);
528 }
529
530 if (c->fp->use_const_buffer &&
531 (src->File == PROGRAM_STATE_VAR ||
532 src->File == PROGRAM_CONSTANT ||
533 src->File == PROGRAM_UNIFORM)) {
534 return get_src_reg_const(c, inst, srcRegIndex, component);
535 }
536 else {
537 /* other type of source register */
538 return get_reg(c, src->File, src->Index, component, nr,
539 src->Negate, src->Abs);
540 }
541 }
542
543
544 /**
545 * Same as \sa get_src_reg() but if the register is a literal, emit
546 * a brw_reg encoding the literal.
547 * Note that a brw instruction only allows one src operand to be a literal.
548 * For instructions with more than one operand, only the second can be a
549 * literal. This means that we treat some literals as constants/uniforms
550 * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
551 *
552 */
553 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
554 const struct prog_instruction *inst,
555 GLuint srcRegIndex, GLuint channel)
556 {
557 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
558 if (src->File == PROGRAM_CONSTANT) {
559 /* a literal */
560 const int component = GET_SWZ(src->Swizzle, channel);
561 const GLfloat *param =
562 c->fp->program.Base.Parameters->ParameterValues[src->Index];
563 GLfloat value = param[component];
564 if (src->Negate & (1 << channel))
565 value = -value;
566 if (src->Abs)
567 value = FABSF(value);
568 #if 0
569 printf(" form immed value %f for chan %d\n", value, channel);
570 #endif
571 return brw_imm_f(value);
572 }
573 else {
574 return get_src_reg(c, inst, srcRegIndex, channel);
575 }
576 }
577
578
579 /**
580 * Subroutines are minimal support for resusable instruction sequences.
581 * They are implemented as simply as possible to minimise overhead: there
582 * is no explicit support for communication between the caller and callee
583 * other than saving the return address in a temporary register, nor is
584 * there any automatic local storage. This implies that great care is
585 * required before attempting reentrancy or any kind of nested
586 * subroutine invocations.
587 */
588 static void invoke_subroutine( struct brw_wm_compile *c,
589 enum _subroutine subroutine,
590 void (*emit)( struct brw_wm_compile * ) )
591 {
592 struct brw_compile *p = &c->func;
593
594 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
595
596 if( c->subroutines[ subroutine ] ) {
597 /* subroutine previously emitted: reuse existing instructions */
598
599 int mark = mark_tmps( c );
600 struct brw_reg return_address = retype( alloc_tmp( c ),
601 BRW_REGISTER_TYPE_UD );
602 int here = p->nr_insn;
603
604 brw_push_insn_state(p);
605 brw_set_mask_control(p, BRW_MASK_DISABLE);
606 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
607
608 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
609 brw_imm_d( ( c->subroutines[ subroutine ] -
610 here - 1 ) << 4 ) );
611 brw_pop_insn_state(p);
612
613 release_tmps( c, mark );
614 } else {
615 /* previously unused subroutine: emit, and mark for later reuse */
616
617 int mark = mark_tmps( c );
618 struct brw_reg return_address = retype( alloc_tmp( c ),
619 BRW_REGISTER_TYPE_UD );
620 struct brw_instruction *calc;
621 int base = p->nr_insn;
622
623 brw_push_insn_state(p);
624 brw_set_mask_control(p, BRW_MASK_DISABLE);
625 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
626 brw_pop_insn_state(p);
627
628 c->subroutines[ subroutine ] = p->nr_insn;
629
630 emit( c );
631
632 brw_push_insn_state(p);
633 brw_set_mask_control(p, BRW_MASK_DISABLE);
634 brw_MOV( p, brw_ip_reg(), return_address );
635 brw_pop_insn_state(p);
636
637 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
638
639 release_tmps( c, mark );
640 }
641 }
642
643 static void emit_trunc( struct brw_wm_compile *c,
644 const struct prog_instruction *inst)
645 {
646 int i;
647 struct brw_compile *p = &c->func;
648 GLuint mask = inst->DstReg.WriteMask;
649 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
650 for (i = 0; i < 4; i++) {
651 if (mask & (1<<i)) {
652 struct brw_reg src, dst;
653 dst = get_dst_reg(c, inst, i);
654 src = get_src_reg(c, inst, 0, i);
655 brw_RNDZ(p, dst, src);
656 }
657 }
658 brw_set_saturate(p, 0);
659 }
660
661 static void emit_mov( struct brw_wm_compile *c,
662 const struct prog_instruction *inst)
663 {
664 int i;
665 struct brw_compile *p = &c->func;
666 GLuint mask = inst->DstReg.WriteMask;
667 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
668 for (i = 0; i < 4; i++) {
669 if (mask & (1<<i)) {
670 struct brw_reg src, dst;
671 dst = get_dst_reg(c, inst, i);
672 /* XXX some moves from immediate value don't work reliably!!! */
673 /*src = get_src_reg_imm(c, inst, 0, i);*/
674 src = get_src_reg(c, inst, 0, i);
675 brw_MOV(p, dst, src);
676 }
677 }
678 brw_set_saturate(p, 0);
679 }
680
681 static void emit_pixel_xy(struct brw_wm_compile *c,
682 const struct prog_instruction *inst)
683 {
684 struct brw_reg r1 = brw_vec1_grf(1, 0);
685 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
686
687 struct brw_reg dst0, dst1;
688 struct brw_compile *p = &c->func;
689 GLuint mask = inst->DstReg.WriteMask;
690
691 dst0 = get_dst_reg(c, inst, 0);
692 dst1 = get_dst_reg(c, inst, 1);
693 /* Calculate pixel centers by adding 1 or 0 to each of the
694 * micro-tile coordinates passed in r1.
695 */
696 if (mask & WRITEMASK_X) {
697 brw_ADD(p,
698 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
699 stride(suboffset(r1_uw, 4), 2, 4, 0),
700 brw_imm_v(0x10101010));
701 }
702
703 if (mask & WRITEMASK_Y) {
704 brw_ADD(p,
705 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
706 stride(suboffset(r1_uw, 5), 2, 4, 0),
707 brw_imm_v(0x11001100));
708 }
709 }
710
711 static void emit_delta_xy(struct brw_wm_compile *c,
712 const struct prog_instruction *inst)
713 {
714 struct brw_reg r1 = brw_vec1_grf(1, 0);
715 struct brw_reg dst0, dst1, src0, src1;
716 struct brw_compile *p = &c->func;
717 GLuint mask = inst->DstReg.WriteMask;
718
719 dst0 = get_dst_reg(c, inst, 0);
720 dst1 = get_dst_reg(c, inst, 1);
721 src0 = get_src_reg(c, inst, 0, 0);
722 src1 = get_src_reg(c, inst, 0, 1);
723 /* Calc delta X,Y by subtracting origin in r1 from the pixel
724 * centers.
725 */
726 if (mask & WRITEMASK_X) {
727 brw_ADD(p,
728 dst0,
729 retype(src0, BRW_REGISTER_TYPE_UW),
730 negate(r1));
731 }
732
733 if (mask & WRITEMASK_Y) {
734 brw_ADD(p,
735 dst1,
736 retype(src1, BRW_REGISTER_TYPE_UW),
737 negate(suboffset(r1,1)));
738
739 }
740 }
741
742 static void fire_fb_write( struct brw_wm_compile *c,
743 GLuint base_reg,
744 GLuint nr,
745 GLuint target,
746 GLuint eot)
747 {
748 struct brw_compile *p = &c->func;
749 /* Pass through control information:
750 */
751 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
752 {
753 brw_push_insn_state(p);
754 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
755 brw_MOV(p,
756 brw_message_reg(base_reg + 1),
757 brw_vec8_grf(1, 0));
758 brw_pop_insn_state(p);
759 }
760 /* Send framebuffer write message: */
761 brw_fb_WRITE(p,
762 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
763 base_reg,
764 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
765 target,
766 nr,
767 0,
768 eot);
769 }
770
771 static void emit_fb_write(struct brw_wm_compile *c,
772 const struct prog_instruction *inst)
773 {
774 struct brw_compile *p = &c->func;
775 int nr = 2;
776 int channel;
777 GLuint target, eot;
778 struct brw_reg src0;
779
780 /* Reserve a space for AA - may not be needed:
781 */
782 if (c->key.aa_dest_stencil_reg)
783 nr += 1;
784
785 brw_push_insn_state(p);
786 for (channel = 0; channel < 4; channel++) {
787 src0 = get_src_reg(c, inst, 0, channel);
788 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
789 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
790 brw_MOV(p, brw_message_reg(nr + channel), src0);
791 }
792 /* skip over the regs populated above: */
793 nr += 8;
794 brw_pop_insn_state(p);
795
796 if (c->key.source_depth_to_render_target) {
797 if (c->key.computes_depth) {
798 src0 = get_src_reg(c, inst, 2, 2);
799 brw_MOV(p, brw_message_reg(nr), src0);
800 }
801 else {
802 src0 = get_src_reg(c, inst, 1, 1);
803 brw_MOV(p, brw_message_reg(nr), src0);
804 }
805
806 nr += 2;
807 }
808
809 if (c->key.dest_depth_reg) {
810 const GLuint comp = c->key.dest_depth_reg / 2;
811 const GLuint off = c->key.dest_depth_reg % 2;
812
813 if (off != 0) {
814 /* XXX this code needs review/testing */
815 struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
816 struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
817
818 brw_push_insn_state(p);
819 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
820
821 brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
822 /* 2nd half? */
823 brw_MOV(p, brw_message_reg(nr+1), arg1_1);
824 brw_pop_insn_state(p);
825 }
826 else
827 {
828 struct brw_reg src = get_src_reg(c, inst, 1, 1);
829 brw_MOV(p, brw_message_reg(nr), src);
830 }
831 nr += 2;
832 }
833
834 target = inst->Aux >> 1;
835 eot = inst->Aux & 1;
836 fire_fb_write(c, 0, nr, target, eot);
837 }
838
839 static void emit_pixel_w( struct brw_wm_compile *c,
840 const struct prog_instruction *inst)
841 {
842 struct brw_compile *p = &c->func;
843 GLuint mask = inst->DstReg.WriteMask;
844 if (mask & WRITEMASK_W) {
845 struct brw_reg dst, src0, delta0, delta1;
846 struct brw_reg interp3;
847
848 dst = get_dst_reg(c, inst, 3);
849 src0 = get_src_reg(c, inst, 0, 0);
850 delta0 = get_src_reg(c, inst, 1, 0);
851 delta1 = get_src_reg(c, inst, 1, 1);
852
853 interp3 = brw_vec1_grf(src0.nr+1, 4);
854 /* Calc 1/w - just linterp wpos[3] optimized by putting the
855 * result straight into a message reg.
856 */
857 brw_LINE(p, brw_null_reg(), interp3, delta0);
858 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
859
860 /* Calc w */
861 brw_math_16( p, dst,
862 BRW_MATH_FUNCTION_INV,
863 BRW_MATH_SATURATE_NONE,
864 2, brw_null_reg(),
865 BRW_MATH_PRECISION_FULL);
866 }
867 }
868
869 static void emit_linterp(struct brw_wm_compile *c,
870 const struct prog_instruction *inst)
871 {
872 struct brw_compile *p = &c->func;
873 GLuint mask = inst->DstReg.WriteMask;
874 struct brw_reg interp[4];
875 struct brw_reg dst, delta0, delta1;
876 struct brw_reg src0;
877 GLuint nr, i;
878
879 src0 = get_src_reg(c, inst, 0, 0);
880 delta0 = get_src_reg(c, inst, 1, 0);
881 delta1 = get_src_reg(c, inst, 1, 1);
882 nr = src0.nr;
883
884 interp[0] = brw_vec1_grf(nr, 0);
885 interp[1] = brw_vec1_grf(nr, 4);
886 interp[2] = brw_vec1_grf(nr+1, 0);
887 interp[3] = brw_vec1_grf(nr+1, 4);
888
889 for(i = 0; i < 4; i++ ) {
890 if (mask & (1<<i)) {
891 dst = get_dst_reg(c, inst, i);
892 brw_LINE(p, brw_null_reg(), interp[i], delta0);
893 brw_MAC(p, dst, suboffset(interp[i],1), delta1);
894 }
895 }
896 }
897
898 static void emit_cinterp(struct brw_wm_compile *c,
899 const struct prog_instruction *inst)
900 {
901 struct brw_compile *p = &c->func;
902 GLuint mask = inst->DstReg.WriteMask;
903
904 struct brw_reg interp[4];
905 struct brw_reg dst, src0;
906 GLuint nr, i;
907
908 src0 = get_src_reg(c, inst, 0, 0);
909 nr = src0.nr;
910
911 interp[0] = brw_vec1_grf(nr, 0);
912 interp[1] = brw_vec1_grf(nr, 4);
913 interp[2] = brw_vec1_grf(nr+1, 0);
914 interp[3] = brw_vec1_grf(nr+1, 4);
915
916 for(i = 0; i < 4; i++ ) {
917 if (mask & (1<<i)) {
918 dst = get_dst_reg(c, inst, i);
919 brw_MOV(p, dst, suboffset(interp[i],3));
920 }
921 }
922 }
923
924 static void emit_pinterp(struct brw_wm_compile *c,
925 const struct prog_instruction *inst)
926 {
927 struct brw_compile *p = &c->func;
928 GLuint mask = inst->DstReg.WriteMask;
929
930 struct brw_reg interp[4];
931 struct brw_reg dst, delta0, delta1;
932 struct brw_reg src0, w;
933 GLuint nr, i;
934
935 src0 = get_src_reg(c, inst, 0, 0);
936 delta0 = get_src_reg(c, inst, 1, 0);
937 delta1 = get_src_reg(c, inst, 1, 1);
938 w = get_src_reg(c, inst, 2, 3);
939 nr = src0.nr;
940
941 interp[0] = brw_vec1_grf(nr, 0);
942 interp[1] = brw_vec1_grf(nr, 4);
943 interp[2] = brw_vec1_grf(nr+1, 0);
944 interp[3] = brw_vec1_grf(nr+1, 4);
945
946 for(i = 0; i < 4; i++ ) {
947 if (mask & (1<<i)) {
948 dst = get_dst_reg(c, inst, i);
949 brw_LINE(p, brw_null_reg(), interp[i], delta0);
950 brw_MAC(p, dst, suboffset(interp[i],1),
951 delta1);
952 brw_MUL(p, dst, dst, w);
953 }
954 }
955 }
956
957 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
958 static void emit_frontfacing(struct brw_wm_compile *c,
959 const struct prog_instruction *inst)
960 {
961 struct brw_compile *p = &c->func;
962 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
963 struct brw_reg dst;
964 GLuint mask = inst->DstReg.WriteMask;
965 int i;
966
967 for (i = 0; i < 4; i++) {
968 if (mask & (1<<i)) {
969 dst = get_dst_reg(c, inst, i);
970 brw_MOV(p, dst, brw_imm_f(0.0));
971 }
972 }
973
974 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
975 * us front face
976 */
977 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
978 for (i = 0; i < 4; i++) {
979 if (mask & (1<<i)) {
980 dst = get_dst_reg(c, inst, i);
981 brw_MOV(p, dst, brw_imm_f(1.0));
982 }
983 }
984 brw_set_predicate_control_flag_value(p, 0xff);
985 }
986
987 static void emit_xpd(struct brw_wm_compile *c,
988 const struct prog_instruction *inst)
989 {
990 int i;
991 struct brw_compile *p = &c->func;
992 GLuint mask = inst->DstReg.WriteMask;
993 for (i = 0; i < 4; i++) {
994 GLuint i2 = (i+2)%3;
995 GLuint i1 = (i+1)%3;
996 if (mask & (1<<i)) {
997 struct brw_reg src0, src1, dst;
998 dst = get_dst_reg(c, inst, i);
999 src0 = negate(get_src_reg(c, inst, 0, i2));
1000 src1 = get_src_reg_imm(c, inst, 1, i1);
1001 brw_MUL(p, brw_null_reg(), src0, src1);
1002 src0 = get_src_reg(c, inst, 0, i1);
1003 src1 = get_src_reg_imm(c, inst, 1, i2);
1004 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1005 brw_MAC(p, dst, src0, src1);
1006 brw_set_saturate(p, 0);
1007 }
1008 }
1009 brw_set_saturate(p, 0);
1010 }
1011
1012 static void emit_dp3(struct brw_wm_compile *c,
1013 const struct prog_instruction *inst)
1014 {
1015 struct brw_reg src0[3], src1[3], dst;
1016 int i;
1017 struct brw_compile *p = &c->func;
1018 GLuint mask = inst->DstReg.WriteMask;
1019 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1020
1021 if (!(mask & WRITEMASK_XYZW))
1022 return;
1023
1024 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1025
1026 for (i = 0; i < 3; i++) {
1027 src0[i] = get_src_reg(c, inst, 0, i);
1028 src1[i] = get_src_reg_imm(c, inst, 1, i);
1029 }
1030
1031 dst = get_dst_reg(c, inst, dst_chan);
1032 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1033 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1034 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1035 brw_MAC(p, dst, src0[2], src1[2]);
1036 brw_set_saturate(p, 0);
1037 }
1038
1039 static void emit_dp4(struct brw_wm_compile *c,
1040 const struct prog_instruction *inst)
1041 {
1042 struct brw_reg src0[4], src1[4], dst;
1043 int i;
1044 struct brw_compile *p = &c->func;
1045 GLuint mask = inst->DstReg.WriteMask;
1046 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1047
1048 if (!(mask & WRITEMASK_XYZW))
1049 return;
1050
1051 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1052
1053 for (i = 0; i < 4; i++) {
1054 src0[i] = get_src_reg(c, inst, 0, i);
1055 src1[i] = get_src_reg_imm(c, inst, 1, i);
1056 }
1057 dst = get_dst_reg(c, inst, dst_chan);
1058 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1059 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1060 brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
1061 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1062 brw_MAC(p, dst, src0[3], src1[3]);
1063 brw_set_saturate(p, 0);
1064 }
1065
1066 static void emit_dph(struct brw_wm_compile *c,
1067 const struct prog_instruction *inst)
1068 {
1069 struct brw_reg src0[4], src1[4], dst;
1070 int i;
1071 struct brw_compile *p = &c->func;
1072 GLuint mask = inst->DstReg.WriteMask;
1073 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1074
1075 if (!(mask & WRITEMASK_XYZW))
1076 return;
1077
1078 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1079
1080 for (i = 0; i < 4; i++) {
1081 src0[i] = get_src_reg(c, inst, 0, i);
1082 src1[i] = get_src_reg_imm(c, inst, 1, i);
1083 }
1084 dst = get_dst_reg(c, inst, dst_chan);
1085 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1086 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1087 brw_MAC(p, dst, src0[2], src1[2]);
1088 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1089 brw_ADD(p, dst, dst, src1[3]);
1090 brw_set_saturate(p, 0);
1091 }
1092
1093 /**
1094 * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
1095 * Note that the result of the function is smeared across the dest
1096 * register's X, Y, Z and W channels (subject to writemasking of course).
1097 */
1098 static void emit_math1(struct brw_wm_compile *c,
1099 const struct prog_instruction *inst, GLuint func)
1100 {
1101 struct brw_compile *p = &c->func;
1102 struct brw_reg src0, dst;
1103 GLuint mask = inst->DstReg.WriteMask;
1104 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1105
1106 if (!(mask & WRITEMASK_XYZW))
1107 return;
1108
1109 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1110
1111 /* Get first component of source register */
1112 dst = get_dst_reg(c, inst, dst_chan);
1113 src0 = get_src_reg(c, inst, 0, 0);
1114
1115 brw_MOV(p, brw_message_reg(2), src0);
1116 brw_math(p,
1117 dst,
1118 func,
1119 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1120 2,
1121 brw_null_reg(),
1122 BRW_MATH_DATA_VECTOR,
1123 BRW_MATH_PRECISION_FULL);
1124 }
1125
1126 static void emit_rcp(struct brw_wm_compile *c,
1127 const struct prog_instruction *inst)
1128 {
1129 emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
1130 }
1131
1132 static void emit_rsq(struct brw_wm_compile *c,
1133 const struct prog_instruction *inst)
1134 {
1135 emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
1136 }
1137
1138 static void emit_sin(struct brw_wm_compile *c,
1139 const struct prog_instruction *inst)
1140 {
1141 emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
1142 }
1143
1144 static void emit_cos(struct brw_wm_compile *c,
1145 const struct prog_instruction *inst)
1146 {
1147 emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
1148 }
1149
1150 static void emit_ex2(struct brw_wm_compile *c,
1151 const struct prog_instruction *inst)
1152 {
1153 emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
1154 }
1155
1156 static void emit_lg2(struct brw_wm_compile *c,
1157 const struct prog_instruction *inst)
1158 {
1159 emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
1160 }
1161
1162 static void emit_add(struct brw_wm_compile *c,
1163 const struct prog_instruction *inst)
1164 {
1165 struct brw_compile *p = &c->func;
1166 struct brw_reg src0, src1, dst;
1167 GLuint mask = inst->DstReg.WriteMask;
1168 int i;
1169 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1170 for (i = 0 ; i < 4; i++) {
1171 if (mask & (1<<i)) {
1172 dst = get_dst_reg(c, inst, i);
1173 src0 = get_src_reg(c, inst, 0, i);
1174 src1 = get_src_reg_imm(c, inst, 1, i);
1175 brw_ADD(p, dst, src0, src1);
1176 }
1177 }
1178 brw_set_saturate(p, 0);
1179 }
1180
1181 static void emit_arl(struct brw_wm_compile *c,
1182 const struct prog_instruction *inst)
1183 {
1184 struct brw_compile *p = &c->func;
1185 struct brw_reg src0, addr_reg;
1186 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1187 addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1188 BRW_ARF_ADDRESS, 0);
1189 src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
1190 brw_MOV(p, addr_reg, src0);
1191 brw_set_saturate(p, 0);
1192 }
1193
1194
1195 static void emit_mul(struct brw_wm_compile *c,
1196 const struct prog_instruction *inst)
1197 {
1198 struct brw_compile *p = &c->func;
1199 struct brw_reg src0, src1, dst;
1200 GLuint mask = inst->DstReg.WriteMask;
1201 int i;
1202 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1203 for (i = 0 ; i < 4; i++) {
1204 if (mask & (1<<i)) {
1205 dst = get_dst_reg(c, inst, i);
1206 src0 = get_src_reg(c, inst, 0, i);
1207 src1 = get_src_reg_imm(c, inst, 1, i);
1208 brw_MUL(p, dst, src0, src1);
1209 }
1210 }
1211 brw_set_saturate(p, 0);
1212 }
1213
1214 static void emit_frc(struct brw_wm_compile *c,
1215 const struct prog_instruction *inst)
1216 {
1217 struct brw_compile *p = &c->func;
1218 struct brw_reg src0, dst;
1219 GLuint mask = inst->DstReg.WriteMask;
1220 int i;
1221 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1222 for (i = 0 ; i < 4; i++) {
1223 if (mask & (1<<i)) {
1224 dst = get_dst_reg(c, inst, i);
1225 src0 = get_src_reg_imm(c, inst, 0, i);
1226 brw_FRC(p, dst, src0);
1227 }
1228 }
1229 if (inst->SaturateMode != SATURATE_OFF)
1230 brw_set_saturate(p, 0);
1231 }
1232
1233 static void emit_flr(struct brw_wm_compile *c,
1234 const struct prog_instruction *inst)
1235 {
1236 struct brw_compile *p = &c->func;
1237 struct brw_reg src0, dst;
1238 GLuint mask = inst->DstReg.WriteMask;
1239 int i;
1240 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1241 for (i = 0 ; i < 4; i++) {
1242 if (mask & (1<<i)) {
1243 dst = get_dst_reg(c, inst, i);
1244 src0 = get_src_reg_imm(c, inst, 0, i);
1245 brw_RNDD(p, dst, src0);
1246 }
1247 }
1248 brw_set_saturate(p, 0);
1249 }
1250
1251
1252 static void emit_min_max(struct brw_wm_compile *c,
1253 const struct prog_instruction *inst)
1254 {
1255 struct brw_compile *p = &c->func;
1256 const GLuint mask = inst->DstReg.WriteMask;
1257 const int mark = mark_tmps(c);
1258 int i;
1259 brw_push_insn_state(p);
1260 for (i = 0; i < 4; i++) {
1261 if (mask & (1<<i)) {
1262 struct brw_reg real_dst = get_dst_reg(c, inst, i);
1263 struct brw_reg src0 = get_src_reg(c, inst, 0, i);
1264 struct brw_reg src1 = get_src_reg(c, inst, 1, i);
1265 struct brw_reg dst;
1266 /* if dst==src0 or dst==src1 we need to use a temp reg */
1267 GLboolean use_temp = brw_same_reg(dst, src0) ||
1268 brw_same_reg(dst, src1);
1269 if (use_temp)
1270 dst = alloc_tmp(c);
1271 else
1272 dst = real_dst;
1273
1274 /*
1275 printf(" Min/max: dst %d src0 %d src1 %d\n",
1276 dst.nr, src0.nr, src1.nr);
1277 */
1278 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1279 brw_MOV(p, dst, src0);
1280 brw_set_saturate(p, 0);
1281
1282 if (inst->Opcode == OPCODE_MIN)
1283 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
1284 else
1285 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
1286
1287 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1288 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1289 brw_MOV(p, dst, src1);
1290 brw_set_saturate(p, 0);
1291 brw_set_predicate_control_flag_value(p, 0xff);
1292 if (use_temp)
1293 brw_MOV(p, real_dst, dst);
1294 }
1295 }
1296 brw_pop_insn_state(p);
1297 release_tmps(c, mark);
1298 }
1299
1300 static void emit_pow(struct brw_wm_compile *c,
1301 const struct prog_instruction *inst)
1302 {
1303 struct brw_compile *p = &c->func;
1304 struct brw_reg dst, src0, src1;
1305 GLuint mask = inst->DstReg.WriteMask;
1306 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1307
1308 if (!(mask & WRITEMASK_XYZW))
1309 return;
1310
1311 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1312
1313 dst = get_dst_reg(c, inst, dst_chan);
1314 src0 = get_src_reg_imm(c, inst, 0, 0);
1315 src1 = get_src_reg_imm(c, inst, 1, 0);
1316
1317 brw_MOV(p, brw_message_reg(2), src0);
1318 brw_MOV(p, brw_message_reg(3), src1);
1319
1320 brw_math(p,
1321 dst,
1322 BRW_MATH_FUNCTION_POW,
1323 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1324 2,
1325 brw_null_reg(),
1326 BRW_MATH_DATA_VECTOR,
1327 BRW_MATH_PRECISION_FULL);
1328 }
1329
1330 static void emit_lrp(struct brw_wm_compile *c,
1331 const struct prog_instruction *inst)
1332 {
1333 struct brw_compile *p = &c->func;
1334 GLuint mask = inst->DstReg.WriteMask;
1335 struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
1336 int i;
1337 int mark = mark_tmps(c);
1338 for (i = 0; i < 4; i++) {
1339 if (mask & (1<<i)) {
1340 dst = get_dst_reg(c, inst, i);
1341 src0 = get_src_reg(c, inst, 0, i);
1342
1343 src1 = get_src_reg_imm(c, inst, 1, i);
1344
1345 if (src1.nr == dst.nr) {
1346 tmp1 = alloc_tmp(c);
1347 brw_MOV(p, tmp1, src1);
1348 } else
1349 tmp1 = src1;
1350
1351 src2 = get_src_reg(c, inst, 2, i);
1352 if (src2.nr == dst.nr) {
1353 tmp2 = alloc_tmp(c);
1354 brw_MOV(p, tmp2, src2);
1355 } else
1356 tmp2 = src2;
1357
1358 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1359 brw_MUL(p, brw_null_reg(), dst, tmp2);
1360 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1361 brw_MAC(p, dst, src0, tmp1);
1362 brw_set_saturate(p, 0);
1363 }
1364 release_tmps(c, mark);
1365 }
1366 }
1367
1368 /**
1369 * For GLSL shaders, this KIL will be unconditional.
1370 * It may be contained inside an IF/ENDIF structure of course.
1371 */
1372 static void emit_kil(struct brw_wm_compile *c)
1373 {
1374 struct brw_compile *p = &c->func;
1375 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1376 brw_push_insn_state(p);
1377 brw_set_mask_control(p, BRW_MASK_DISABLE);
1378 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1379 brw_AND(p, depth, c->emit_mask_reg, depth);
1380 brw_pop_insn_state(p);
1381 }
1382
1383 static void emit_mad(struct brw_wm_compile *c,
1384 const struct prog_instruction *inst)
1385 {
1386 struct brw_compile *p = &c->func;
1387 GLuint mask = inst->DstReg.WriteMask;
1388 struct brw_reg dst, src0, src1, src2;
1389 int i;
1390
1391 for (i = 0; i < 4; i++) {
1392 if (mask & (1<<i)) {
1393 dst = get_dst_reg(c, inst, i);
1394 src0 = get_src_reg(c, inst, 0, i);
1395 src1 = get_src_reg_imm(c, inst, 1, i);
1396 src2 = get_src_reg_imm(c, inst, 2, i);
1397 brw_MUL(p, dst, src0, src1);
1398
1399 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1400 brw_ADD(p, dst, dst, src2);
1401 brw_set_saturate(p, 0);
1402 }
1403 }
1404 }
1405
1406 static void emit_sop(struct brw_wm_compile *c,
1407 const struct prog_instruction *inst, GLuint cond)
1408 {
1409 struct brw_compile *p = &c->func;
1410 GLuint mask = inst->DstReg.WriteMask;
1411 struct brw_reg dst, src0, src1;
1412 int i;
1413
1414 for (i = 0; i < 4; i++) {
1415 if (mask & (1<<i)) {
1416 dst = get_dst_reg(c, inst, i);
1417 src0 = get_src_reg(c, inst, 0, i);
1418 src1 = get_src_reg_imm(c, inst, 1, i);
1419 brw_push_insn_state(p);
1420 brw_CMP(p, brw_null_reg(), cond, src0, src1);
1421 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1422 brw_MOV(p, dst, brw_imm_f(0.0));
1423 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1424 brw_MOV(p, dst, brw_imm_f(1.0));
1425 brw_pop_insn_state(p);
1426 }
1427 }
1428 }
1429
1430 static void emit_slt(struct brw_wm_compile *c,
1431 const struct prog_instruction *inst)
1432 {
1433 emit_sop(c, inst, BRW_CONDITIONAL_L);
1434 }
1435
1436 static void emit_sle(struct brw_wm_compile *c,
1437 const struct prog_instruction *inst)
1438 {
1439 emit_sop(c, inst, BRW_CONDITIONAL_LE);
1440 }
1441
1442 static void emit_sgt(struct brw_wm_compile *c,
1443 const struct prog_instruction *inst)
1444 {
1445 emit_sop(c, inst, BRW_CONDITIONAL_G);
1446 }
1447
1448 static void emit_sge(struct brw_wm_compile *c,
1449 const struct prog_instruction *inst)
1450 {
1451 emit_sop(c, inst, BRW_CONDITIONAL_GE);
1452 }
1453
1454 static void emit_seq(struct brw_wm_compile *c,
1455 const struct prog_instruction *inst)
1456 {
1457 emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1458 }
1459
1460 static void emit_sne(struct brw_wm_compile *c,
1461 const struct prog_instruction *inst)
1462 {
1463 emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1464 }
1465
1466 static void emit_ddx(struct brw_wm_compile *c,
1467 const struct prog_instruction *inst)
1468 {
1469 struct brw_compile *p = &c->func;
1470 GLuint mask = inst->DstReg.WriteMask;
1471 struct brw_reg interp[4];
1472 struct brw_reg dst;
1473 struct brw_reg src0, w;
1474 GLuint nr, i;
1475 src0 = get_src_reg(c, inst, 0, 0);
1476 w = get_src_reg(c, inst, 1, 3);
1477 nr = src0.nr;
1478 interp[0] = brw_vec1_grf(nr, 0);
1479 interp[1] = brw_vec1_grf(nr, 4);
1480 interp[2] = brw_vec1_grf(nr+1, 0);
1481 interp[3] = brw_vec1_grf(nr+1, 4);
1482 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1483 for(i = 0; i < 4; i++ ) {
1484 if (mask & (1<<i)) {
1485 dst = get_dst_reg(c, inst, i);
1486 brw_MOV(p, dst, interp[i]);
1487 brw_MUL(p, dst, dst, w);
1488 }
1489 }
1490 brw_set_saturate(p, 0);
1491 }
1492
1493 static void emit_ddy(struct brw_wm_compile *c,
1494 const struct prog_instruction *inst)
1495 {
1496 struct brw_compile *p = &c->func;
1497 GLuint mask = inst->DstReg.WriteMask;
1498 struct brw_reg interp[4];
1499 struct brw_reg dst;
1500 struct brw_reg src0, w;
1501 GLuint nr, i;
1502
1503 src0 = get_src_reg(c, inst, 0, 0);
1504 nr = src0.nr;
1505 w = get_src_reg(c, inst, 1, 3);
1506 interp[0] = brw_vec1_grf(nr, 0);
1507 interp[1] = brw_vec1_grf(nr, 4);
1508 interp[2] = brw_vec1_grf(nr+1, 0);
1509 interp[3] = brw_vec1_grf(nr+1, 4);
1510 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1511 for(i = 0; i < 4; i++ ) {
1512 if (mask & (1<<i)) {
1513 dst = get_dst_reg(c, inst, i);
1514 brw_MOV(p, dst, suboffset(interp[i], 1));
1515 brw_MUL(p, dst, dst, w);
1516 }
1517 }
1518 brw_set_saturate(p, 0);
1519 }
1520
1521 static INLINE struct brw_reg high_words( struct brw_reg reg )
1522 {
1523 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1524 0, 8, 2 );
1525 }
1526
1527 static INLINE struct brw_reg low_words( struct brw_reg reg )
1528 {
1529 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1530 }
1531
1532 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1533 {
1534 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1535 }
1536
1537 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1538 {
1539 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1540 0, 16, 2 );
1541 }
1542
1543 /* One-, two- and three-dimensional Perlin noise, similar to the description
1544 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
/**
 * Emit the body of the one-dimensional noise subroutine (passed to
 * invoke_subroutine() by emit_noise1()).
 *
 * The coordinate is taken from the temporary returned by
 * lookup_tmp( c, mark - 2 ) and the same register receives the final noise
 * value; emit_noise1() copies it to the destination channels afterwards.
 * Five temporaries are allocated here and all of them are released again
 * before returning.
 *
 * The order of the emitted instructions is deliberate (independent work is
 * interleaved, see the comments below), so statements must not be
 * reordered casually.
 */
1545 static void noise1_sub( struct brw_wm_compile *c ) {
1546
1547 struct brw_compile *p = &c->func;
1548 struct brw_reg param,
1549 x0, x1, /* gradients at each end */
1550 t, tmp[ 2 ], /* float temporaries */
1551 itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1552 int i;
1553 int mark = mark_tmps( c );
1554
1555 x0 = alloc_tmp( c );
1556 x1 = alloc_tmp( c );
1557 t = alloc_tmp( c );
1558 tmp[ 0 ] = alloc_tmp( c );
1559 tmp[ 1 ] = alloc_tmp( c );
/* itmp[ 0..1 ] alias tmp[ 0..1 ] (the values being hashed); itmp[ 2..4 ]
   alias x0, x1 and t and temporarily hold the three hash multipliers until
   those registers are needed as floats. */
1560 itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1561 itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1562 itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1563 itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1564 itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1565
/* NOTE(review): presumably the temporary emit_noise1() allocated for the
   argument; the "- 2" offset depends on how invoke_subroutine() uses
   temporaries -- confirm against its definition. */
1566 param = lookup_tmp( c, mark - 2 );
1567
/* NOTE(review): the access mode is not restored in this function;
   presumably the instruction state is saved/restored around the
   subroutine body by the caller -- confirm. */
1568 brw_set_access_mode( p, BRW_ALIGN_1 );
1569
1570 brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1571
1572 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1573 be hashed. Also compute the remainder (offset within the unit
1574 length), interleaved to reduce register dependency penalties. */
1575 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1576 brw_FRC( p, param, param );
1577 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1578 brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1579 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1580
1581 /* We're now ready to perform the hashing. The two hashes are
1582 interleaved for performance. The hash function used is
1583 designed to rapidly achieve avalanche and require only 32x16
1584 bit multiplication, and 16-bit swizzles (which we get for
1585 free). We can't use immediate operands in the multiplies,
1586 because immediates are permitted only in src1 and the 16-bit
1587 factor is permitted only in src0. */
/* Three rounds of: multiply by a constant, then XOR the high word of each
   dword into its low word. */
1588 for( i = 0; i < 2; i++ )
1589 brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1590 for( i = 0; i < 2; i++ )
1591 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1592 high_words( itmp[ i ] ) );
1593 for( i = 0; i < 2; i++ )
1594 brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1595 for( i = 0; i < 2; i++ )
1596 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1597 high_words( itmp[ i ] ) );
1598 for( i = 0; i < 2; i++ )
1599 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1600 for( i = 0; i < 2; i++ )
1601 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1602 high_words( itmp[ i ] ) );
1603
1604 /* Now we want to initialise the two gradients based on the
1605 hashes. Format conversion from signed integer to float leaves
1606 everything scaled too high by a factor of pow( 2, 31 ), but
1607 we correct for that right at the end. */
/* From here on x0, x1 and t are used as floats again; the hash constants
   they held are no longer needed.  t = offset - 1 is the distance to the
   far end point. */
1608 brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1609 brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1610 brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1611
1612 brw_MUL( p, x0, x0, param );
1613 brw_MUL( p, x1, x1, t );
1614
1615 /* We interpolate between the gradients using the polynomial
1616 6t^5 - 15t^4 + 10t^3 (Perlin). */
1617 brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1618 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1619 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1620 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1621 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1622 brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1623 pipeline */
1624 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1625 brw_MUL( p, param, tmp[ 0 ], param );
/* result = x0 + (x1 - x0) * weight, with the weight now held in param */
1626 brw_MUL( p, x1, x1, param );
1627 brw_ADD( p, x0, x0, x1 );
1628 /* scale by pow( 2, -30 ), to compensate for the format conversion
1629 above and an extra factor of 2 so that a single gradient covers
1630 the [-1,1] range */
1631 brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1632
1633 release_tmps( c, mark );
1634 }
1635
1636 static void emit_noise1( struct brw_wm_compile *c,
1637 const struct prog_instruction *inst )
1638 {
1639 struct brw_compile *p = &c->func;
1640 struct brw_reg src, param, dst;
1641 GLuint mask = inst->DstReg.WriteMask;
1642 int i;
1643 int mark = mark_tmps( c );
1644
1645 assert( mark == 0 );
1646
1647 src = get_src_reg( c, inst, 0, 0 );
1648
1649 param = alloc_tmp( c );
1650
1651 brw_MOV( p, param, src );
1652
1653 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1654
1655 /* Fill in the result: */
1656 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1657 for (i = 0 ; i < 4; i++) {
1658 if (mask & (1<<i)) {
1659 dst = get_dst_reg(c, inst, i);
1660 brw_MOV( p, dst, param );
1661 }
1662 }
1663 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1664 brw_set_saturate( p, 0 );
1665
1666 release_tmps( c, mark );
1667 }
1668
/**
 * Emit the body of the two-dimensional noise subroutine (passed to
 * invoke_subroutine() by emit_noise2()).
 *
 * The two coordinates are taken from the temporaries returned by
 * lookup_tmp( c, mark - 3 ) and lookup_tmp( c, mark - 2 ); the final noise
 * value is left in the first of them, which emit_noise2() copies to the
 * destination channels afterwards.  Nine temporaries are allocated here and
 * all of them are released again before returning.
 *
 * The order of the emitted instructions is deliberate (independent work is
 * interleaved, see the comments below), so statements must not be
 * reordered casually.
 */
1669 static void noise2_sub( struct brw_wm_compile *c ) {
1670
1671 struct brw_compile *p = &c->func;
1672 struct brw_reg param0, param1,
1673 x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1674 t, tmp[ 4 ], /* float temporaries */
1675 itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1676 int i;
1677 int mark = mark_tmps( c );
1678
1679 x0y0 = alloc_tmp( c );
1680 x0y1 = alloc_tmp( c );
1681 x1y0 = alloc_tmp( c );
1682 x1y1 = alloc_tmp( c );
1683 t = alloc_tmp( c );
1684 for( i = 0; i < 4; i++ ) {
1685 tmp[ i ] = alloc_tmp( c );
1686 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1687 }
/* itmp[ 4..6 ] alias three of the gradient registers and temporarily hold
   the three hash multipliers until those registers are needed as floats. */
1688 itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1689 itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1690 itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1691
/* NOTE(review): presumably the two temporaries emit_noise2() allocated for
   the arguments; the offsets depend on how invoke_subroutine() uses
   temporaries -- confirm against its definition. */
1692 param0 = lookup_tmp( c, mark - 3 );
1693 param1 = lookup_tmp( c, mark - 2 );
1694
/* NOTE(review): the access mode is not restored in this function;
   presumably the instruction state is saved/restored around the
   subroutine body by the caller -- confirm. */
1695 brw_set_access_mode( p, BRW_ALIGN_1 );
1696
1697 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1698 be hashed. Also compute the remainders (offsets within the unit
1699 square), interleaved to reduce register dependency penalties. */
1700 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1701 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1702 brw_FRC( p, param0, param0 );
1703 brw_FRC( p, param1, param1 );
1704 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
/* Fold the second coordinate's low word into the high word of itmp[ 0 ],
   so one dword carries both coordinates; the 0x10000 / 0x1 / 0x10001
   offsets below then step to the other three corners. */
1705 brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1706 low_words( itmp[ 1 ] ) );
1707 brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1708 brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1709 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1710 brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1711 brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1712
1713 /* We're now ready to perform the hashing. The four hashes are
1714 interleaved for performance. The hash function used is
1715 designed to rapidly achieve avalanche and require only 32x16
1716 bit multiplication, and 16-bit swizzles (which we get for
1717 free). We can't use immediate operands in the multiplies,
1718 because immediates are permitted only in src1 and the 16-bit
1719 factor is permitted only in src0. */
/* Three rounds of: multiply by a constant, then XOR the high word of each
   dword into its low word. */
1720 for( i = 0; i < 4; i++ )
1721 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1722 for( i = 0; i < 4; i++ )
1723 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1724 high_words( itmp[ i ] ) );
1725 for( i = 0; i < 4; i++ )
1726 brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1727 for( i = 0; i < 4; i++ )
1728 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1729 high_words( itmp[ i ] ) );
1730 for( i = 0; i < 4; i++ )
1731 brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1732 for( i = 0; i < 4; i++ )
1733 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1734 high_words( itmp[ i ] ) );
1735
1736 /* Now we want to initialise the four gradients based on the
1737 hashes. Format conversion from signed integer to float leaves
1738 everything scaled too high by a factor of pow( 2, 15 ), but
1739 we correct for that right at the end. */
/* The low word of each hash supplies the x component of the corner's
   gradient (moved into the x?y? registers, overwriting the hash constants
   they held); the high word supplies the y component (converted in place
   in tmp[]). */
1740 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1741 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1742 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1743 brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1744 brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1745
1746 brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1747 brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1748 brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1749 brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1750
/* t is first param0 - 1 (for the x products), then reused as param1 - 1
   (for the y products). */
1751 brw_MUL( p, x1y0, x1y0, t );
1752 brw_MUL( p, x1y1, x1y1, t );
1753 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1754 brw_MUL( p, x0y0, x0y0, param0 );
1755 brw_MUL( p, x0y1, x0y1, param0 );
1756
1757 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1758 brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1759 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1760 brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1761
1762 brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1763 brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1764 brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1765 brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1766
1767 /* We interpolate between the gradients using the polynomial
1768 6t^5 - 15t^4 + 10t^3 (Perlin). */
1769 brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1770 brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1771 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1772 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1773 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1774 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1775 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1776 pipeline */
1777 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1778 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1779 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1780 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1781 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1782 pipeline */
1783 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1784 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
/* param0 / param1 now become the interpolation weights for x and y. */
1785 brw_MUL( p, param0, tmp[ 0 ], param0 );
1786 brw_MUL( p, param1, tmp[ 1 ], param1 );
1787
1788 /* Here we interpolate in the y dimension... */
1789 brw_MUL( p, x0y1, x0y1, param1 );
1790 brw_MUL( p, x1y1, x1y1, param1 );
1791 brw_ADD( p, x0y0, x0y0, x0y1 );
1792 brw_ADD( p, x1y0, x1y0, x1y1 );
1793
1794 /* And now in x. There are horrible register dependencies here,
1795 but we have nothing else to do. */
1796 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1797 brw_MUL( p, x1y0, x1y0, param0 );
1798 brw_ADD( p, x0y0, x0y0, x1y0 );
1799
1800 /* scale by pow( 2, -15 ), as described above */
1801 brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1802
1803 release_tmps( c, mark );
1804 }
1805
1806 static void emit_noise2( struct brw_wm_compile *c,
1807 const struct prog_instruction *inst )
1808 {
1809 struct brw_compile *p = &c->func;
1810 struct brw_reg src0, src1, param0, param1, dst;
1811 GLuint mask = inst->DstReg.WriteMask;
1812 int i;
1813 int mark = mark_tmps( c );
1814
1815 assert( mark == 0 );
1816
1817 src0 = get_src_reg( c, inst, 0, 0 );
1818 src1 = get_src_reg( c, inst, 0, 1 );
1819
1820 param0 = alloc_tmp( c );
1821 param1 = alloc_tmp( c );
1822
1823 brw_MOV( p, param0, src0 );
1824 brw_MOV( p, param1, src1 );
1825
1826 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1827
1828 /* Fill in the result: */
1829 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1830 for (i = 0 ; i < 4; i++) {
1831 if (mask & (1<<i)) {
1832 dst = get_dst_reg(c, inst, i);
1833 brw_MOV( p, dst, param0 );
1834 }
1835 }
1836 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1837 brw_set_saturate( p, 0 );
1838
1839 release_tmps( c, mark );
1840 }
1841
1842 /**
1843 * The three-dimensional case is much like the one- and two- versions above,
1844 * but since the number of corners is rapidly growing we now pack 16 16-bit
1845 * hashes into each register to extract more parallelism from the EUs.
1846 */
1847 static void noise3_sub( struct brw_wm_compile *c ) {
1848
1849 struct brw_compile *p = &c->func;
1850 struct brw_reg param0, param1, param2,
1851 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1852 xi, yi, zi, /* interpolation coefficients */
1853 t, tmp[ 8 ], /* float temporaries */
1854 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1855 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1856 int i;
1857 int mark = mark_tmps( c );
1858
1859 x0y0 = alloc_tmp( c );
1860 x0y1 = alloc_tmp( c );
1861 x1y0 = alloc_tmp( c );
1862 x1y1 = alloc_tmp( c );
1863 xi = alloc_tmp( c );
1864 yi = alloc_tmp( c );
1865 zi = alloc_tmp( c );
1866 t = alloc_tmp( c );
1867 for( i = 0; i < 8; i++ ) {
1868 tmp[ i ] = alloc_tmp( c );
1869 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1870 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1871 }
1872
1873 param0 = lookup_tmp( c, mark - 4 );
1874 param1 = lookup_tmp( c, mark - 3 );
1875 param2 = lookup_tmp( c, mark - 2 );
1876
1877 brw_set_access_mode( p, BRW_ALIGN_1 );
1878
1879 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1880 be hashed. Also compute the remainders (offsets within the unit
1881 cube), interleaved to reduce register dependency penalties. */
1882 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1883 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1884 brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1885 brw_FRC( p, param0, param0 );
1886 brw_FRC( p, param1, param1 );
1887 brw_FRC( p, param2, param2 );
1888 /* Since we now have only 16 bits of precision in the hash, we must
1889 be more careful about thorough mixing to maintain entropy as we
1890 squash the input vector into a small scalar. */
1891 brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1892 brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1893 brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1894 brw_imm_uw( 0x9B93 ) );
1895 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1896 brw_imm_uw( 0xBC8F ) );
1897
1898 /* Temporarily disable the execution mask while we work with ExecSize=16
1899 channels (the mask is set for ExecSize=8 and is probably incorrect).
1900 Although this might cause execution of unwanted channels, the code
1901 writes only to temporary registers and has no side effects, so
1902 disabling the mask is harmless. */
1903 brw_push_insn_state( p );
1904 brw_set_mask_control( p, BRW_MASK_DISABLE );
1905 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1906 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1907 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1908
1909 /* We're now ready to perform the hashing. The eight hashes are
1910 interleaved for performance. The hash function used is
1911 designed to rapidly achieve avalanche and require only 16x16
1912 bit multiplication, and 8-bit swizzles (which we get for
1913 free). */
1914 for( i = 0; i < 4; i++ )
1915 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1916 for( i = 0; i < 4; i++ )
1917 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1918 odd_bytes( wtmp[ i ] ) );
1919 for( i = 0; i < 4; i++ )
1920 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1921 for( i = 0; i < 4; i++ )
1922 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1923 odd_bytes( wtmp[ i ] ) );
1924 brw_pop_insn_state( p );
1925
1926 /* Now we want to initialise the four rear gradients based on the
1927 hashes. Format conversion from signed integer to float leaves
1928 everything scaled too high by a factor of pow( 2, 15 ), but
1929 we correct for that right at the end. */
1930 /* x component */
1931 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1932 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1933 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1934 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1935 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1936
1937 brw_push_insn_state( p );
1938 brw_set_mask_control( p, BRW_MASK_DISABLE );
1939 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1940 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1941 brw_pop_insn_state( p );
1942
1943 brw_MUL( p, x1y0, x1y0, t );
1944 brw_MUL( p, x1y1, x1y1, t );
1945 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1946 brw_MUL( p, x0y0, x0y0, param0 );
1947 brw_MUL( p, x0y1, x0y1, param0 );
1948
1949 /* y component */
1950 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1951 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1952 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1953 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1954
1955 brw_push_insn_state( p );
1956 brw_set_mask_control( p, BRW_MASK_DISABLE );
1957 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1958 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1959 brw_pop_insn_state( p );
1960
1961 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1962 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1963 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1964 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1965 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1966
1967 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1968 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1969 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1970 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1971
1972 /* z component */
1973 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1974 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1975 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1976 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1977
1978 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1979 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1980 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1981 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1982
1983 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1984 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1985 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1986 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1987
1988 /* We interpolate between the gradients using the polynomial
1989 6t^5 - 15t^4 + 10t^3 (Perlin). */
1990 brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
1991 brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
1992 brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
1993 brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
1994 brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
1995 brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
1996 brw_MUL( p, xi, xi, param0 );
1997 brw_MUL( p, yi, yi, param1 );
1998 brw_MUL( p, zi, zi, param2 );
1999 brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
2000 brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
2001 brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
2002 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
2003 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
2004 brw_MUL( p, xi, xi, param0 );
2005 brw_MUL( p, yi, yi, param1 );
2006 brw_MUL( p, zi, zi, param2 );
2007 brw_MUL( p, xi, xi, param0 );
2008 brw_MUL( p, yi, yi, param1 );
2009 brw_MUL( p, zi, zi, param2 );
2010 brw_MUL( p, xi, xi, param0 );
2011 brw_MUL( p, yi, yi, param1 );
2012 brw_MUL( p, zi, zi, param2 );
2013
2014 /* Here we interpolate in the y dimension... */
2015 brw_MUL( p, x0y1, x0y1, yi );
2016 brw_MUL( p, x1y1, x1y1, yi );
2017 brw_ADD( p, x0y0, x0y0, x0y1 );
2018 brw_ADD( p, x1y0, x1y0, x1y1 );
2019
2020 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
2021 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2022 brw_MUL( p, x1y0, x1y0, xi );
2023 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2024
2025 /* Now do the same thing for the front four gradients... */
2026 /* x component */
2027 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2028 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2029 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2030 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2031
2032 brw_push_insn_state( p );
2033 brw_set_mask_control( p, BRW_MASK_DISABLE );
2034 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
2035 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
2036 brw_pop_insn_state( p );
2037
2038 brw_MUL( p, x1y0, x1y0, t );
2039 brw_MUL( p, x1y1, x1y1, t );
2040 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
2041 brw_MUL( p, x0y0, x0y0, param0 );
2042 brw_MUL( p, x0y1, x0y1, param0 );
2043
2044 /* y component */
2045 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2046 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2047 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2048 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2049
2050 brw_push_insn_state( p );
2051 brw_set_mask_control( p, BRW_MASK_DISABLE );
2052 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
2053 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
2054 brw_pop_insn_state( p );
2055
2056 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2057 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2058 brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
2059 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
2060 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
2061
2062 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2063 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2064 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2065 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2066
2067 /* z component */
2068 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2069 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2070 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2071 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2072
2073 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2074 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2075 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2076 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2077
2078 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2079 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2080 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2081 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2082
2083 /* The interpolation coefficients are still around from last time, so
2084 again interpolate in the y dimension... */
2085 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2086 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2087 brw_MUL( p, x0y1, x0y1, yi );
2088 brw_MUL( p, x1y1, x1y1, yi );
2089 brw_ADD( p, x0y0, x0y0, x0y1 );
2090 brw_ADD( p, x1y0, x1y0, x1y1 );
2091
2092 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2093 time put the front face in tmp[ 1 ] and we're nearly there... */
2094 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2095 brw_MUL( p, x1y0, x1y0, xi );
2096 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2097
2098 /* The final interpolation, in the z dimension: */
2099 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2100 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
2101 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2102
2103 /* scale by pow( 2, -15 ), as described above */
2104 brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2105
2106 release_tmps( c, mark );
2107 }
2108
2109 static void emit_noise3( struct brw_wm_compile *c,
2110 const struct prog_instruction *inst )
2111 {
2112 struct brw_compile *p = &c->func;
2113 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
2114 GLuint mask = inst->DstReg.WriteMask;
2115 int i;
2116 int mark = mark_tmps( c );
2117
2118 assert( mark == 0 );
2119
2120 src0 = get_src_reg( c, inst, 0, 0 );
2121 src1 = get_src_reg( c, inst, 0, 1 );
2122 src2 = get_src_reg( c, inst, 0, 2 );
2123
2124 param0 = alloc_tmp( c );
2125 param1 = alloc_tmp( c );
2126 param2 = alloc_tmp( c );
2127
2128 brw_MOV( p, param0, src0 );
2129 brw_MOV( p, param1, src1 );
2130 brw_MOV( p, param2, src2 );
2131
2132 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
2133
2134 /* Fill in the result: */
2135 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2136 for (i = 0 ; i < 4; i++) {
2137 if (mask & (1<<i)) {
2138 dst = get_dst_reg(c, inst, i);
2139 brw_MOV( p, dst, param0 );
2140 }
2141 }
2142 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2143 brw_set_saturate( p, 0 );
2144
2145 release_tmps( c, mark );
2146 }
2147
2148 /**
2149 * For the four-dimensional case, the little micro-optimisation benefits
2150 * we obtain by unrolling all the loops aren't worth the massive bloat it
2151 * now causes. Instead, we loop twice around performing a similar operation
2152 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
2153 * code to glue it all together.
 *
 * Inputs: param[ 0..3 ] are fetched with lookup_tmp() at indices
 * mark - 5 .. mark - 2 (presumably the temporaries the caller filled in
 * before invoking the subroutine -- confirm against invoke_subroutine()).
 * Output: the final, rescaled value is written to param[ 0 ].
 * release_tmps( c, mark ) is called before returning.
2154 */
2155 static void noise4_sub( struct brw_wm_compile *c )
2156 {
2157 struct brw_compile *p = &c->func;
2158 struct brw_reg param[ 4 ],
2159 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
2160 w0, /* noise for the w=0 cube */
2161 floors[ 2 ], /* integer coordinates of base corner of hypercube */
2162 interp[ 4 ], /* interpolation coefficients */
2163 t, tmp[ 8 ], /* float temporaries */
2164 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
2165 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
2166 int i, j;
2167 int mark = mark_tmps( c );
2168 GLuint loop, origin;
2169
2170 x0y0 = alloc_tmp( c );
2171 x0y1 = alloc_tmp( c );
2172 x1y0 = alloc_tmp( c );
2173 x1y1 = alloc_tmp( c );
2174 t = alloc_tmp( c );
2175 w0 = alloc_tmp( c );
2176 floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2177 floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2178
/* The parameters are looked up relative to the mark rather than
   allocated; each one also gets its own interpolation coefficient. */
2179 for( i = 0; i < 4; i++ ) {
2180 param[ i ] = lookup_tmp( c, mark - 5 + i );
2181 interp[ i ] = alloc_tmp( c );
2182 }
2183
/* itmp[ i ] and wtmp[ i ] are views of the same register as tmp[ i ]. */
2184 for( i = 0; i < 8; i++ ) {
2185 tmp[ i ] = alloc_tmp( c );
2186 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
2187 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
2188 }
2189
/* NOTE(review): the access mode is switched to ALIGN_1 here and is not
   switched back anywhere in this function -- confirm callers expect that. */
2190 brw_set_access_mode( p, BRW_ALIGN_1 );
2191
2192 /* We only want 16 bits of precision from the integral part of each
2193 co-ordinate, but unfortunately the RNDD semantics would saturate
2194 at 16 bits if we performed the operation directly to a 16-bit
2195 destination. Therefore, we round to 32-bit temporaries where
2196 appropriate, and then store only the lower 16 bits. */
2197 brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
2198 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
2199 brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
2200 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
2201 brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
2202 brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
2203
2204 /* Modify the flag register here, because the side effect is useful
2205 later (see below). We know for certain that all flags will be
2206 cleared, since the FRC instruction cannot possibly generate
2207 negative results. Even for exceptional inputs (infinities, denormals,
2208 NaNs), the architecture guarantees that the L conditional is false. */
2209 brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
2210 brw_FRC( p, param[ 0 ], param[ 0 ] );
/* NOTE(review): presumably emitting an instruction with a conditional
   modifier leaves predication switched on, hence this explicit reset --
   confirm against the instruction emitter. */
2211 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2212 for( i = 1; i < 4; i++ )
2213 brw_FRC( p, param[ i ], param[ i ] );
2214
2215 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
2216 of all. */
2217 for( i = 0; i < 4; i++ )
2218 brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
2219 for( i = 0; i < 4; i++ )
2220 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
2221 for( i = 0; i < 4; i++ )
2222 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2223 for( i = 0; i < 4; i++ )
2224 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
2225 for( j = 0; j < 3; j++ )
2226 for( i = 0; i < 4; i++ )
2227 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2228
2229 /* Mark the current address, as it will be a jump destination. The
2230 following code will be executed twice: first, with the flag
2231 register clear indicating the w=0 case, and second with flags
2232 set for w=1. */
2233 loop = p->nr_insn;
2234
2235 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
2236 be hashed. Since we have only 16 bits of precision in the hash, we
2237 must be careful about thorough mixing to maintain entropy as we
2238 squash the input vector into a small scalar. */
2239 brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
2240 brw_imm_uw( 0xBC8F ) );
2241 brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
2242 brw_imm_uw( 0xD0BD ) );
2243 brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
2244 brw_imm_uw( 0x9B93 ) );
2245 brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
2246 brw_imm_uw( 0xA359 ) );
2247 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
2248 brw_imm_uw( 0xBC8F ) );
2249
2250 /* Temporarily disable the execution mask while we work with ExecSize=16
2251 channels (the mask is set for ExecSize=8 and is probably incorrect).
2252 Although this might cause execution of unwanted channels, the code
2253 writes only to temporary registers and has no side effects, so
2254 disabling the mask is harmless. */
2255 brw_push_insn_state( p );
2256 brw_set_mask_control( p, BRW_MASK_DISABLE );
2257 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
2258 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
2259 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
2260
2261 /* We're now ready to perform the hashing. The eight hashes are
2262 interleaved for performance. The hash function used is
2263 designed to rapidly achieve avalanche and require only 16x16
2264 bit multiplication, and 8-bit swizzles (which we get for
2265 free). */
2266 for( i = 0; i < 4; i++ )
2267 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
2268 for( i = 0; i < 4; i++ )
2269 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2270 odd_bytes( wtmp[ i ] ) );
2271 for( i = 0; i < 4; i++ )
2272 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
2273 for( i = 0; i < 4; i++ )
2274 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2275 odd_bytes( wtmp[ i ] ) );
2276 brw_pop_insn_state( p );
2277
2278 /* Now we want to initialise the four rear gradients based on the
2279 hashes. Format conversion from signed integer to float leaves
2280 everything scaled too high by a factor of pow( 2, 15 ), but
2281 we correct for that right at the end. */
2282 /* x component */
2283 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2284 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
2285 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
2286 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
2287 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
2288
/* Shift the hashes left by 4 bits between components so that each
   component conversion below reads different hash bits. */
2289 brw_push_insn_state( p );
2290 brw_set_mask_control( p, BRW_MASK_DISABLE );
2291 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2292 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2293 brw_pop_insn_state( p );
2294
2295 brw_MUL( p, x1y0, x1y0, t );
2296 brw_MUL( p, x1y1, x1y1, t );
2297 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2298 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2299 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2300
2301 /* y component */
2302 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2303 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2304 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2305 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2306
2307 brw_push_insn_state( p );
2308 brw_set_mask_control( p, BRW_MASK_DISABLE );
2309 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2310 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2311 brw_pop_insn_state( p );
2312
2313 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2314 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2315 /* prepare t for the w component (used below): w the first time through
2316 the loop; w - 1 the second time) */
2317 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2318 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2319 p->current->header.predicate_inverse = 1;
2320 brw_MOV( p, t, param[ 3 ] );
2321 p->current->header.predicate_inverse = 0;
2322 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2323 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2324 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2325
2326 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2327 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2328 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2329 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2330
2331 /* z component */
2332 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2333 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2334 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2335 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2336
2337 brw_push_insn_state( p );
2338 brw_set_mask_control( p, BRW_MASK_DISABLE );
2339 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2340 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2341 brw_pop_insn_state( p );
2342
2343 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
2344 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
2345 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
2346 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
2347
2348 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2349 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2350 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2351 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2352
2353 /* w component */
2354 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2355 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2356 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2357 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2358
2359 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2360 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2361 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2362 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
/* t is reloaded with x - 1 here, ready for the front-face x component. */
2363 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2364
2365 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2366 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2367 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2368 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2369
2370 /* Here we interpolate in the y dimension... */
2371 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2372 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2373 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2374 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2375 brw_ADD( p, x0y0, x0y0, x0y1 );
2376 brw_ADD( p, x1y0, x1y0, x1y1 );
2377
2378 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
2379 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2380 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2381 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2382
2383 /* Now do the same thing for the front four gradients... */
2384 /* x component */
2385 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2386 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2387 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2388 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2389
2390 brw_push_insn_state( p );
2391 brw_set_mask_control( p, BRW_MASK_DISABLE );
2392 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2393 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2394 brw_pop_insn_state( p );
2395
2396 brw_MUL( p, x1y0, x1y0, t );
2397 brw_MUL( p, x1y1, x1y1, t );
2398 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2399 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2400 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2401
2402 /* y component */
2403 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2404 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2405 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2406 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2407
2408 brw_push_insn_state( p );
2409 brw_set_mask_control( p, BRW_MASK_DISABLE );
2410 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2411 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2412 brw_pop_insn_state( p );
2413
2414 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2415 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2416 brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
2417 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2418 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2419
2420 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2421 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2422 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2423 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2424
2425 /* z component */
2426 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2427 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2428 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2429 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2430
2431 brw_push_insn_state( p );
2432 brw_set_mask_control( p, BRW_MASK_DISABLE );
2433 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2434 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2435 brw_pop_insn_state( p );
2436
2437 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2438 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2439 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2440 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2441 /* prepare t for the w component (used below): w the first time through
2442 the loop; w - 1 the second time) */
2443 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2444 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2445 p->current->header.predicate_inverse = 1;
2446 brw_MOV( p, t, param[ 3 ] );
2447 p->current->header.predicate_inverse = 0;
2448 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2449
2450 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2451 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2452 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2453 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2454
2455 /* w component */
2456 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2457 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2458 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2459 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2460
2461 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2462 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2463 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2464 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2465
2466 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2467 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2468 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2469 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2470
2471 /* Interpolate in the y dimension: */
2472 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2473 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2474 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2475 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2476 brw_ADD( p, x0y0, x0y0, x0y1 );
2477 brw_ADD( p, x1y0, x1y0, x1y1 );
2478
2479 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2480 time put the front face in tmp[ 1 ] and we're nearly there... */
2481 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2482 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2483 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2484
2485 /* Another interpolation, in the z dimension: */
2486 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2487 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2488 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2489
2490 /* Exit the loop if we've computed both cubes... */
/* The jump distance is emitted as a placeholder 0; src1 of this
   instruction (p->store + origin) is patched further down once the
   exit address is known. */
2491 origin = p->nr_insn;
2492 brw_push_insn_state( p );
2493 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2494 brw_set_mask_control( p, BRW_MASK_DISABLE );
2495 brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2496 brw_pop_insn_state( p );
2497
2498 /* Save the result for the w=0 case, and increment the w coordinate: */
2499 brw_MOV( p, w0, tmp[ 0 ] );
2500 brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2501 brw_imm_uw( 1 ) );
2502
2503 /* Loop around for the other cube. Explicitly set the flag register
2504 (unfortunately we must spend an extra instruction to do this: we
2505 can't rely on a side effect of the previous MOV or ADD because
2506 conditional modifiers which are normally true might be false in
2507 exceptional circumstances, e.g. given a NaN input; the add to
2508 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
/* NOTE(review): the << 4 presumably converts an instruction count into a
   byte offset for the IP -- confirm the instruction size. */
2509 brw_push_insn_state( p );
2510 brw_set_mask_control( p, BRW_MASK_DISABLE );
2511 brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2512 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2513 brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2514 brw_pop_insn_state( p );
2515
2516 /* Patch the previous conditional branch now that we know the
2517 destination address. */
2518 brw_set_src1( p->store + origin,
2519 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2520
2521 /* The very last interpolation. */
2522 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2523 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2524 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2525
2526 /* scale by pow( 2, -15 ), as described above */
2527 brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2528
2529 release_tmps( c, mark );
2530 }
2531
2532 static void emit_noise4( struct brw_wm_compile *c,
2533 const struct prog_instruction *inst )
2534 {
2535 struct brw_compile *p = &c->func;
2536 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2537 GLuint mask = inst->DstReg.WriteMask;
2538 int i;
2539 int mark = mark_tmps( c );
2540
2541 assert( mark == 0 );
2542
2543 src0 = get_src_reg( c, inst, 0, 0 );
2544 src1 = get_src_reg( c, inst, 0, 1 );
2545 src2 = get_src_reg( c, inst, 0, 2 );
2546 src3 = get_src_reg( c, inst, 0, 3 );
2547
2548 param0 = alloc_tmp( c );
2549 param1 = alloc_tmp( c );
2550 param2 = alloc_tmp( c );
2551 param3 = alloc_tmp( c );
2552
2553 brw_MOV( p, param0, src0 );
2554 brw_MOV( p, param1, src1 );
2555 brw_MOV( p, param2, src2 );
2556 brw_MOV( p, param3, src3 );
2557
2558 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2559
2560 /* Fill in the result: */
2561 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2562 for (i = 0 ; i < 4; i++) {
2563 if (mask & (1<<i)) {
2564 dst = get_dst_reg(c, inst, i);
2565 brw_MOV( p, dst, param0 );
2566 }
2567 }
2568 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2569 brw_set_saturate( p, 0 );
2570
2571 release_tmps( c, mark );
2572 }
2573
2574 static void emit_wpos_xy(struct brw_wm_compile *c,
2575 const struct prog_instruction *inst)
2576 {
2577 struct brw_compile *p = &c->func;
2578 GLuint mask = inst->DstReg.WriteMask;
2579 struct brw_reg src0[2], dst[2];
2580
2581 dst[0] = get_dst_reg(c, inst, 0);
2582 dst[1] = get_dst_reg(c, inst, 1);
2583
2584 src0[0] = get_src_reg(c, inst, 0, 0);
2585 src0[1] = get_src_reg(c, inst, 0, 1);
2586
2587 /* Calculate the pixel offset from window bottom left into destination
2588 * X and Y channels.
2589 */
2590 if (mask & WRITEMASK_X) {
2591 /* X' = X - origin_x */
2592 brw_ADD(p,
2593 dst[0],
2594 retype(src0[0], BRW_REGISTER_TYPE_W),
2595 brw_imm_d(0 - c->key.origin_x));
2596 }
2597
2598 if (mask & WRITEMASK_Y) {
2599 /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2600 brw_ADD(p,
2601 dst[1],
2602 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2603 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2604 }
2605 }
2606
2607 /* TODO
2608 BIAS on SIMD8 not working yet...
2609 */
2610 static void emit_txb(struct brw_wm_compile *c,
2611 const struct prog_instruction *inst)
2612 {
2613 struct brw_compile *p = &c->func;
2614 struct brw_reg dst[4], src[4], payload_reg;
2615 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2616 GLuint i;
2617 GLuint msg_type;
2618
2619 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2620
2621 for (i = 0; i < 4; i++)
2622 dst[i] = get_dst_reg(c, inst, i);
2623 for (i = 0; i < 4; i++)
2624 src[i] = get_src_reg(c, inst, 0, i);
2625
2626 switch (inst->TexSrcTarget) {
2627 case TEXTURE_1D_INDEX:
2628 brw_MOV(p, brw_message_reg(2), src[0]); /* s coord */
2629 brw_MOV(p, brw_message_reg(3), brw_imm_f(0)); /* t coord */
2630 brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); /* r coord */
2631 break;
2632 case TEXTURE_2D_INDEX:
2633 case TEXTURE_RECT_INDEX:
2634 brw_MOV(p, brw_message_reg(2), src[0]);
2635 brw_MOV(p, brw_message_reg(3), src[1]);
2636 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2637 break;
2638 default:
2639 brw_MOV(p, brw_message_reg(2), src[0]);
2640 brw_MOV(p, brw_message_reg(3), src[1]);
2641 brw_MOV(p, brw_message_reg(4), src[2]);
2642 break;
2643 }
2644 brw_MOV(p, brw_message_reg(5), src[3]); /* bias */
2645 brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); /* ref (unused?) */
2646
2647 if (BRW_IS_IGDNG(p->brw)) {
2648 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG;
2649 } else {
2650 /* Does it work well on SIMD8? */
2651 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2652 }
2653
2654 brw_SAMPLE(p,
2655 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2656 1, /* msg_reg_nr */
2657 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2658 SURF_INDEX_TEXTURE(unit),
2659 unit, /* sampler */
2660 inst->DstReg.WriteMask, /* writemask */
2661 msg_type, /* msg_type */
2662 4, /* response_length */
2663 4, /* msg_length */
2664 0, /* eot */
2665 1,
2666 BRW_SAMPLER_SIMD_MODE_SIMD8);
2667 }
2668
2669
2670 static void emit_tex(struct brw_wm_compile *c,
2671 const struct prog_instruction *inst)
2672 {
2673 struct brw_compile *p = &c->func;
2674 struct brw_reg dst[4], src[4], payload_reg;
2675 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2676 GLuint msg_len;
2677 GLuint i, nr;
2678 GLuint emit;
2679 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2680 GLuint msg_type;
2681
2682 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2683
2684 for (i = 0; i < 4; i++)
2685 dst[i] = get_dst_reg(c, inst, i);
2686 for (i = 0; i < 4; i++)
2687 src[i] = get_src_reg(c, inst, 0, i);
2688
2689 switch (inst->TexSrcTarget) {
2690 case TEXTURE_1D_INDEX:
2691 emit = WRITEMASK_X;
2692 nr = 1;
2693 break;
2694 case TEXTURE_2D_INDEX:
2695 case TEXTURE_RECT_INDEX:
2696 emit = WRITEMASK_XY;
2697 nr = 2;
2698 break;
2699 default:
2700 emit = WRITEMASK_XYZ;
2701 nr = 3;
2702 break;
2703 }
2704 msg_len = 1;
2705
2706 /* move/load S, T, R coords */
2707 for (i = 0; i < nr; i++) {
2708 static const GLuint swz[4] = {0,1,2,2};
2709 if (emit & (1<<i))
2710 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2711 else
2712 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2713 msg_len += 1;
2714 }
2715
2716 if (shadow) {
2717 brw_MOV(p, brw_message_reg(5), brw_imm_f(0)); /* lod / bias */
2718 brw_MOV(p, brw_message_reg(6), src[2]); /* ref value / R coord */
2719 }
2720
2721 if (BRW_IS_IGDNG(p->brw)) {
2722 if (shadow)
2723 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG;
2724 else
2725 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG;
2726 } else {
2727 /* Does it work for shadow on SIMD8 ? */
2728 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2729 }
2730
2731 brw_SAMPLE(p,
2732 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2733 1, /* msg_reg_nr */
2734 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2735 SURF_INDEX_TEXTURE(unit),
2736 unit, /* sampler */
2737 inst->DstReg.WriteMask, /* writemask */
2738 msg_type, /* msg_type */
2739 4, /* response_length */
2740 shadow ? 6 : 4, /* msg_length */
2741 0, /* eot */
2742 1,
2743 BRW_SAMPLER_SIMD_MODE_SIMD8);
2744
2745 if (shadow)
2746 brw_MOV(p, dst[3], brw_imm_f(1.0));
2747 }
2748
2749
2750 /**
2751 * Resolve subroutine calls after code emit is done.
2752 */
2753 static void post_wm_emit( struct brw_wm_compile *c )
2754 {
2755 brw_resolve_cals(&c->func);
2756 }
2757
2758 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2759 {
2760 #define MAX_IF_DEPTH 32
2761 #define MAX_LOOP_DEPTH 32
2762 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
2763 GLuint i, if_depth = 0, loop_depth = 0;
2764 struct brw_compile *p = &c->func;
2765 struct brw_indirect stack_index = brw_indirect(0, 0);
2766
2767 c->out_of_regs = GL_FALSE;
2768
2769 prealloc_reg(c);
2770 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2771 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2772
2773 for (i = 0; i < c->nr_fp_insns; i++) {
2774 const struct prog_instruction *inst = &c->prog_instructions[i];
2775
2776 c->cur_inst = i;
2777
2778 #if 0
2779 _mesa_printf("Inst %d: ", i);
2780 _mesa_print_instruction(inst);
2781 #endif
2782
2783 /* fetch any constants that this instruction needs */
2784 if (c->fp->use_const_buffer)
2785 fetch_constants(c, inst);
2786
2787 if (inst->CondUpdate)
2788 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2789 else
2790 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2791
2792 switch (inst->Opcode) {
2793 case WM_PIXELXY:
2794 emit_pixel_xy(c, inst);
2795 break;
2796 case WM_DELTAXY:
2797 emit_delta_xy(c, inst);
2798 break;
2799 case WM_PIXELW:
2800 emit_pixel_w(c, inst);
2801 break;
2802 case WM_LINTERP:
2803 emit_linterp(c, inst);
2804 break;
2805 case WM_PINTERP:
2806 emit_pinterp(c, inst);
2807 break;
2808 case WM_CINTERP:
2809 emit_cinterp(c, inst);
2810 break;
2811 case WM_WPOSXY:
2812 emit_wpos_xy(c, inst);
2813 break;
2814 case WM_FB_WRITE:
2815 emit_fb_write(c, inst);
2816 break;
2817 case WM_FRONTFACING:
2818 emit_frontfacing(c, inst);
2819 break;
2820 case OPCODE_ADD:
2821 emit_add(c, inst);
2822 break;
2823 case OPCODE_ARL:
2824 emit_arl(c, inst);
2825 break;
2826 case OPCODE_FRC:
2827 emit_frc(c, inst);
2828 break;
2829 case OPCODE_FLR:
2830 emit_flr(c, inst);
2831 break;
2832 case OPCODE_LRP:
2833 emit_lrp(c, inst);
2834 break;
2835 case OPCODE_TRUNC:
2836 emit_trunc(c, inst);
2837 break;
2838 case OPCODE_MOV:
2839 case OPCODE_SWZ:
2840 emit_mov(c, inst);
2841 break;
2842 case OPCODE_DP3:
2843 emit_dp3(c, inst);
2844 break;
2845 case OPCODE_DP4:
2846 emit_dp4(c, inst);
2847 break;
2848 case OPCODE_XPD:
2849 emit_xpd(c, inst);
2850 break;
2851 case OPCODE_DPH:
2852 emit_dph(c, inst);
2853 break;
2854 case OPCODE_RCP:
2855 emit_rcp(c, inst);
2856 break;
2857 case OPCODE_RSQ:
2858 emit_rsq(c, inst);
2859 break;
2860 case OPCODE_SIN:
2861 emit_sin(c, inst);
2862 break;
2863 case OPCODE_COS:
2864 emit_cos(c, inst);
2865 break;
2866 case OPCODE_EX2:
2867 emit_ex2(c, inst);
2868 break;
2869 case OPCODE_LG2:
2870 emit_lg2(c, inst);
2871 break;
2872 case OPCODE_MIN:
2873 case OPCODE_MAX:
2874 emit_min_max(c, inst);
2875 break;
2876 case OPCODE_DDX:
2877 emit_ddx(c, inst);
2878 break;
2879 case OPCODE_DDY:
2880 emit_ddy(c, inst);
2881 break;
2882 case OPCODE_SLT:
2883 emit_slt(c, inst);
2884 break;
2885 case OPCODE_SLE:
2886 emit_sle(c, inst);
2887 break;
2888 case OPCODE_SGT:
2889 emit_sgt(c, inst);
2890 break;
2891 case OPCODE_SGE:
2892 emit_sge(c, inst);
2893 break;
2894 case OPCODE_SEQ:
2895 emit_seq(c, inst);
2896 break;
2897 case OPCODE_SNE:
2898 emit_sne(c, inst);
2899 break;
2900 case OPCODE_MUL:
2901 emit_mul(c, inst);
2902 break;
2903 case OPCODE_POW:
2904 emit_pow(c, inst);
2905 break;
2906 case OPCODE_MAD:
2907 emit_mad(c, inst);
2908 break;
2909 case OPCODE_NOISE1:
2910 emit_noise1(c, inst);
2911 break;
2912 case OPCODE_NOISE2:
2913 emit_noise2(c, inst);
2914 break;
2915 case OPCODE_NOISE3:
2916 emit_noise3(c, inst);
2917 break;
2918 case OPCODE_NOISE4:
2919 emit_noise4(c, inst);
2920 break;
2921 case OPCODE_TEX:
2922 emit_tex(c, inst);
2923 break;
2924 case OPCODE_TXB:
2925 emit_txb(c, inst);
2926 break;
2927 case OPCODE_KIL_NV:
2928 emit_kil(c);
2929 break;
2930 case OPCODE_IF:
2931 assert(if_depth < MAX_IF_DEPTH);
2932 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
2933 break;
2934 case OPCODE_ELSE:
2935 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
2936 break;
2937 case OPCODE_ENDIF:
2938 assert(if_depth > 0);
2939 brw_ENDIF(p, if_inst[--if_depth]);
2940 break;
2941 case OPCODE_BGNSUB:
2942 brw_save_label(p, inst->Comment, p->nr_insn);
2943 break;
2944 case OPCODE_ENDSUB:
2945 /* no-op */
2946 break;
2947 case OPCODE_CAL:
2948 brw_push_insn_state(p);
2949 brw_set_mask_control(p, BRW_MASK_DISABLE);
2950 brw_set_access_mode(p, BRW_ALIGN_1);
2951 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2952 brw_set_access_mode(p, BRW_ALIGN_16);
2953 brw_ADD(p, get_addr_reg(stack_index),
2954 get_addr_reg(stack_index), brw_imm_d(4));
2955 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2956 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2957 brw_pop_insn_state(p);
2958 break;
2959
2960 case OPCODE_RET:
2961 brw_push_insn_state(p);
2962 brw_set_mask_control(p, BRW_MASK_DISABLE);
2963 brw_ADD(p, get_addr_reg(stack_index),
2964 get_addr_reg(stack_index), brw_imm_d(-4));
2965 brw_set_access_mode(p, BRW_ALIGN_1);
2966 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2967 brw_set_access_mode(p, BRW_ALIGN_16);
2968 brw_pop_insn_state(p);
2969
2970 break;
2971 case OPCODE_BGNLOOP:
2972 /* XXX may need to invalidate the current_constant regs */
2973 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2974 break;
2975 case OPCODE_BRK:
2976 brw_BREAK(p);
2977 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2978 break;
2979 case OPCODE_CONT:
2980 brw_CONT(p);
2981 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2982 break;
2983 case OPCODE_ENDLOOP:
2984 {
2985 struct brw_instruction *inst0, *inst1;
2986 GLuint br = 1;
2987
2988 if (BRW_IS_IGDNG(brw))
2989 br = 2;
2990
2991 loop_depth--;
2992 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2993 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2994 while (inst0 > loop_inst[loop_depth]) {
2995 inst0--;
2996 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2997 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2998 inst0->bits3.if_else.pop_count = 0;
2999 }
3000 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
3001 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
3002 inst0->bits3.if_else.pop_count = 0;
3003 }
3004 }
3005 }
3006 break;
3007 default:
3008 _mesa_printf("unsupported IR in fragment shader %d\n",
3009 inst->Opcode);
3010 }
3011
3012 if (inst->CondUpdate)
3013 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
3014 else
3015 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3016 }
3017 post_wm_emit(c);
3018
3019 if (INTEL_DEBUG & DEBUG_WM) {
3020 _mesa_printf("wm-native:\n");
3021 for (i = 0; i < p->nr_insn; i++)
3022 brw_disasm(stderr, &p->store[i]);
3023 _mesa_printf("\n");
3024 }
3025 }
3026
3027 /**
3028 * Do GPU code generation for shaders that use GLSL features such as
3029 * flow control. Other shaders will be compiled with the
3030 */
3031 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
3032 {
3033 if (INTEL_DEBUG & DEBUG_WM) {
3034 _mesa_printf("brw_wm_glsl_emit:\n");
3035 }
3036
3037 /* initial instruction translation/simplification */
3038 brw_wm_pass_fp(c);
3039
3040 /* actual code generation */
3041 brw_wm_emit_glsl(brw, c);
3042
3043 if (INTEL_DEBUG & DEBUG_WM) {
3044 brw_wm_print_program(c, "brw_wm_glsl_emit done");
3045 }
3046
3047 c->prog_data.total_grf = num_grf_used(c);
3048 c->prog_data.total_scratch = 0;
3049 }