Merge branch 'mesa_7_5_branch'
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
/**
 * Subroutine identifiers.  invoke_subroutine() uses these values as
 * indices into c->subroutines[].
 */
enum _subroutine {
   SUB_NOISE1,
   SUB_NOISE2,
   SUB_NOISE3,
   SUB_NOISE4
};
12
13
14 /**
15 * Determine if the given fragment program uses GLSL features such
16 * as flow conditionals, loops, subroutines.
17 * Some GLSL shaders may use these features, others might not.
18 */
19 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
20 {
21 int i;
22 for (i = 0; i < fp->Base.NumInstructions; i++) {
23 const struct prog_instruction *inst = &fp->Base.Instructions[i];
24 switch (inst->Opcode) {
25 case OPCODE_IF:
26 case OPCODE_TRUNC:
27 case OPCODE_ENDIF:
28 case OPCODE_CAL:
29 case OPCODE_BRK:
30 case OPCODE_RET:
31 case OPCODE_DDX:
32 case OPCODE_DDY:
33 case OPCODE_NOISE1:
34 case OPCODE_NOISE2:
35 case OPCODE_NOISE3:
36 case OPCODE_NOISE4:
37 case OPCODE_BGNLOOP:
38 return GL_TRUE;
39 default:
40 break;
41 }
42 }
43 return GL_FALSE;
44 }
45
46
47
/* Forward declaration: alloc_grf() calls this ahead of its definition. */
static void reclaim_temps(struct brw_wm_compile *c);
50
51
52 /** Mark GRF register as used. */
53 static void
54 prealloc_grf(struct brw_wm_compile *c, int r)
55 {
56 c->used_grf[r] = GL_TRUE;
57 }
58
59
60 /** Mark given GRF register as not in use. */
61 static void
62 release_grf(struct brw_wm_compile *c, int r)
63 {
64 /*assert(c->used_grf[r]);*/
65 c->used_grf[r] = GL_FALSE;
66 c->first_free_grf = MIN2(c->first_free_grf, r);
67 }
68
69
70 /** Return index of a free GRF, mark it as used. */
71 static int
72 alloc_grf(struct brw_wm_compile *c)
73 {
74 GLuint r;
75 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
76 if (!c->used_grf[r]) {
77 c->used_grf[r] = GL_TRUE;
78 c->first_free_grf = r + 1; /* a guess */
79 return r;
80 }
81 }
82
83 /* no free temps, try to reclaim some */
84 reclaim_temps(c);
85 c->first_free_grf = 0;
86
87 /* try alloc again */
88 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
89 if (!c->used_grf[r]) {
90 c->used_grf[r] = GL_TRUE;
91 c->first_free_grf = r + 1; /* a guess */
92 return r;
93 }
94 }
95
96 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
97 assert(c->used_grf[r]);
98 }
99
100 /* really, no free GRF regs found */
101 if (!c->out_of_regs) {
102 /* print warning once per compilation */
103 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
104 c->out_of_regs = GL_TRUE;
105 }
106
107 return -1;
108 }
109
110
111 /** Return number of GRF registers used */
112 static int
113 num_grf_used(const struct brw_wm_compile *c)
114 {
115 int r;
116 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
117 if (c->used_grf[r])
118 return r + 1;
119 return 0;
120 }
121
122
123
124 /**
125 * Record the mapping of a Mesa register to a hardware register.
126 */
127 static void set_reg(struct brw_wm_compile *c, int file, int index,
128 int component, struct brw_reg reg)
129 {
130 c->wm_regs[file][index][component].reg = reg;
131 c->wm_regs[file][index][component].inited = GL_TRUE;
132 }
133
134 /**
135 * Examine instruction's write mask to find index of first component
136 * enabled for writing.
137 */
138 static int get_scalar_dst_index(const struct prog_instruction *inst)
139 {
140 int i;
141 for (i = 0; i < 4; i++)
142 if (inst->DstReg.WriteMask & (1<<i))
143 break;
144 return i;
145 }
146
147 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
148 {
149 struct brw_reg reg;
150
151 /* if we need to allocate another temp, grow the tmp_regs[] array */
152 if (c->tmp_index == c->tmp_max) {
153 int r = alloc_grf(c);
154 if (r < 0) {
155 /*printf("Out of temps in %s\n", __FUNCTION__);*/
156 r = 50; /* XXX random register! */
157 }
158 c->tmp_regs[ c->tmp_max++ ] = r;
159 }
160
161 /* form the GRF register */
162 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
163 /*printf("alloc_temp %d\n", reg.nr);*/
164 assert(reg.nr < BRW_WM_MAX_GRF);
165 return reg;
166
167 }
168
169 /**
170 * Save current temp register info.
171 * There must be a matching call to release_tmps().
172 */
173 static int mark_tmps(struct brw_wm_compile *c)
174 {
175 return c->tmp_index;
176 }
177
178 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
179 {
180 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
181 }
182
183 static void release_tmps(struct brw_wm_compile *c, int mark)
184 {
185 c->tmp_index = mark;
186 }
187
188 /**
189 * Convert Mesa src register to brw register.
190 *
191 * Since we're running in SOA mode each Mesa register corresponds to four
192 * hardware registers. We allocate the hardware registers as needed here.
193 *
194 * \param file register file, one of PROGRAM_x
195 * \param index register number
196 * \param component src component (X=0, Y=1, Z=2, W=3)
197 * \param nr not used?!?
198 * \param neg negate value?
199 * \param abs take absolute value?
200 */
201 static struct brw_reg
202 get_reg(struct brw_wm_compile *c, int file, int index, int component,
203 int nr, GLuint neg, GLuint abs)
204 {
205 struct brw_reg reg;
206 switch (file) {
207 case PROGRAM_STATE_VAR:
208 case PROGRAM_CONSTANT:
209 case PROGRAM_UNIFORM:
210 file = PROGRAM_STATE_VAR;
211 break;
212 case PROGRAM_UNDEFINED:
213 return brw_null_reg();
214 case PROGRAM_TEMPORARY:
215 case PROGRAM_INPUT:
216 case PROGRAM_OUTPUT:
217 case PROGRAM_PAYLOAD:
218 break;
219 default:
220 _mesa_problem(NULL, "Unexpected file in get_reg()");
221 return brw_null_reg();
222 }
223
224 assert(index < 256);
225 assert(component < 4);
226
227 /* see if we've already allocated a HW register for this Mesa register */
228 if (c->wm_regs[file][index][component].inited) {
229 /* yes, re-use */
230 reg = c->wm_regs[file][index][component].reg;
231 }
232 else {
233 /* no, allocate new register */
234 int grf = alloc_grf(c);
235 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
236 if (grf < 0) {
237 /* totally out of temps */
238 grf = 51; /* XXX random register! */
239 }
240
241 reg = brw_vec8_grf(grf, 0);
242 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
243
244 set_reg(c, file, index, component, reg);
245 }
246
247 if (neg & (1 << component)) {
248 reg = negate(reg);
249 }
250 if (abs)
251 reg = brw_abs(reg);
252 return reg;
253 }
254
255
256
257 /**
258 * This is called if we run out of GRF registers. Examine the live intervals
259 * of temp regs in the program and free those which won't be used again.
260 */
261 static void
262 reclaim_temps(struct brw_wm_compile *c)
263 {
264 GLint intBegin[MAX_PROGRAM_TEMPS];
265 GLint intEnd[MAX_PROGRAM_TEMPS];
266 int index;
267
268 /*printf("Reclaim temps:\n");*/
269
270 _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
271 intBegin, intEnd);
272
273 for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
274 if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
275 /* program temp[i] can be freed */
276 int component;
277 /*printf(" temp[%d] is dead\n", index);*/
278 for (component = 0; component < 4; component++) {
279 if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
280 int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
281 release_grf(c, r);
282 /*
283 printf(" Reclaim temp %d, reg %d at inst %d\n",
284 index, r, c->cur_inst);
285 */
286 c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
287 }
288 }
289 }
290 }
291 }
292
293
294
295
296 /**
297 * Preallocate registers. This sets up the Mesa to hardware register
298 * mapping for certain registers, such as constants (uniforms/state vars)
299 * and shader inputs.
300 */
301 static void prealloc_reg(struct brw_wm_compile *c)
302 {
303 int i, j;
304 struct brw_reg reg;
305 int urb_read_length = 0;
306 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
307 GLuint reg_index = 0;
308
309 memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
310 c->first_free_grf = 0;
311
312 for (i = 0; i < 4; i++) {
313 if (i < c->key.nr_depth_regs)
314 reg = brw_vec8_grf(i * 2, 0);
315 else
316 reg = brw_vec8_grf(0, 0);
317 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
318 }
319 reg_index += 2 * c->key.nr_depth_regs;
320
321 /* constants */
322 {
323 const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
324 const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
325
326 /* use a real constant buffer, or just use a section of the GRF? */
327 /* XXX this heuristic may need adjustment... */
328 if ((nr_params + nr_temps) * 4 + reg_index > 80)
329 c->fp->use_const_buffer = GL_TRUE;
330 else
331 c->fp->use_const_buffer = GL_FALSE;
332 /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
333
334 if (c->fp->use_const_buffer) {
335 /* We'll use a real constant buffer and fetch constants from
336 * it with a dataport read message.
337 */
338
339 /* number of float constants in CURBE */
340 c->prog_data.nr_params = 0;
341 }
342 else {
343 const struct gl_program_parameter_list *plist =
344 c->fp->program.Base.Parameters;
345 int index = 0;
346
347 /* number of float constants in CURBE */
348 c->prog_data.nr_params = 4 * nr_params;
349
350 /* loop over program constants (float[4]) */
351 for (i = 0; i < nr_params; i++) {
352 /* loop over XYZW channels */
353 for (j = 0; j < 4; j++, index++) {
354 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
355 /* Save pointer to parameter/constant value.
356 * Constants will be copied in prepare_constant_buffer()
357 */
358 c->prog_data.param[index] = &plist->ParameterValues[i][j];
359 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
360 }
361 }
362 /* number of constant regs used (each reg is float[8]) */
363 c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
364 reg_index += c->nr_creg;
365 }
366 }
367
368 /* fragment shader inputs */
369 for (i = 0; i < VERT_RESULT_MAX; i++) {
370 int fp_input;
371
372 if (i >= VERT_RESULT_VAR0)
373 fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
374 else if (i <= VERT_RESULT_TEX7)
375 fp_input = i;
376 else
377 fp_input = -1;
378
379 if (fp_input >= 0 && inputs & (1 << fp_input)) {
380 urb_read_length = reg_index;
381 reg = brw_vec8_grf(reg_index, 0);
382 for (j = 0; j < 4; j++)
383 set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
384 }
385 if (c->key.vp_outputs_written & (1 << i)) {
386 reg_index += 2;
387 }
388 }
389
390 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
391 c->prog_data.urb_read_length = urb_read_length;
392 c->prog_data.curb_read_length = c->nr_creg;
393 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
394 reg_index++;
395 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
396 reg_index += 2;
397
398 /* mark GRF regs [0..reg_index-1] as in-use */
399 for (i = 0; i < reg_index; i++)
400 prealloc_grf(c, i);
401
402 /* Don't use GRF 126, 127. Using them seems to lead to GPU lock-ups */
403 prealloc_grf(c, 126);
404 prealloc_grf(c, 127);
405
406 /* An instruction may reference up to three constants.
407 * They'll be found in these registers.
408 * XXX alloc these on demand!
409 */
410 if (c->fp->use_const_buffer) {
411 for (i = 0; i < 3; i++) {
412 c->current_const[i].index = -1;
413 c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
414 }
415 }
416 #if 0
417 printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
418 printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
419 #endif
420 }
421
422
423 /**
424 * Check if any of the instruction's src registers are constants, uniforms,
425 * or statevars. If so, fetch any constants that we don't already have in
426 * the three GRF slots.
427 */
428 static void fetch_constants(struct brw_wm_compile *c,
429 const struct prog_instruction *inst)
430 {
431 struct brw_compile *p = &c->func;
432 GLuint i;
433
434 /* loop over instruction src regs */
435 for (i = 0; i < 3; i++) {
436 const struct prog_src_register *src = &inst->SrcReg[i];
437 if (src->File == PROGRAM_STATE_VAR ||
438 src->File == PROGRAM_CONSTANT ||
439 src->File == PROGRAM_UNIFORM) {
440 c->current_const[i].index = src->Index;
441
442 #if 0
443 printf(" fetch const[%d] for arg %d into reg %d\n",
444 src->Index, i, c->current_const[i].reg.nr);
445 #endif
446
447 /* need to fetch the constant now */
448 brw_dp_READ_4(p,
449 c->current_const[i].reg, /* writeback dest */
450 1, /* msg_reg */
451 src->RelAddr, /* relative indexing? */
452 16 * src->Index, /* byte offset */
453 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
454 );
455 }
456 }
457 }
458
459
460 /**
461 * Convert Mesa dst register to brw register.
462 */
463 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
464 const struct prog_instruction *inst,
465 GLuint component)
466 {
467 const int nr = 1;
468 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
469 0, 0);
470 }
471
472
473 static struct brw_reg
474 get_src_reg_const(struct brw_wm_compile *c,
475 const struct prog_instruction *inst,
476 GLuint srcRegIndex, GLuint component)
477 {
478 /* We should have already fetched the constant from the constant
479 * buffer in fetch_constants(). Now we just have to return a
480 * register description that extracts the needed component and
481 * smears it across all eight vector components.
482 */
483 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
484 struct brw_reg const_reg;
485
486 assert(component < 4);
487 assert(srcRegIndex < 3);
488 assert(c->current_const[srcRegIndex].index != -1);
489 const_reg = c->current_const[srcRegIndex].reg;
490
491 /* extract desired float from the const_reg, and smear */
492 const_reg = stride(const_reg, 0, 1, 0);
493 const_reg.subnr = component * 4;
494
495 if (src->Negate & (1 << component))
496 const_reg = negate(const_reg);
497 if (src->Abs)
498 const_reg = brw_abs(const_reg);
499
500 #if 0
501 printf(" form const[%d].%d for arg %d, reg %d\n",
502 c->current_const[srcRegIndex].index,
503 component,
504 srcRegIndex,
505 const_reg.nr);
506 #endif
507
508 return const_reg;
509 }
510
511
512 /**
513 * Convert Mesa src register to brw register.
514 */
515 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
516 const struct prog_instruction *inst,
517 GLuint srcRegIndex, GLuint channel)
518 {
519 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
520 const GLuint nr = 1;
521 const GLuint component = GET_SWZ(src->Swizzle, channel);
522
523 /* Extended swizzle terms */
524 if (component == SWIZZLE_ZERO) {
525 return brw_imm_f(0.0F);
526 }
527 else if (component == SWIZZLE_ONE) {
528 return brw_imm_f(1.0F);
529 }
530
531 if (c->fp->use_const_buffer &&
532 (src->File == PROGRAM_STATE_VAR ||
533 src->File == PROGRAM_CONSTANT ||
534 src->File == PROGRAM_UNIFORM)) {
535 return get_src_reg_const(c, inst, srcRegIndex, component);
536 }
537 else {
538 /* other type of source register */
539 return get_reg(c, src->File, src->Index, component, nr,
540 src->Negate, src->Abs);
541 }
542 }
543
544
545 /**
546 * Same as \sa get_src_reg() but if the register is a literal, emit
547 * a brw_reg encoding the literal.
548 * Note that a brw instruction only allows one src operand to be a literal.
549 * For instructions with more than one operand, only the second can be a
550 * literal. This means that we treat some literals as constants/uniforms
551 * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
552 *
553 */
554 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
555 const struct prog_instruction *inst,
556 GLuint srcRegIndex, GLuint channel)
557 {
558 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
559 if (src->File == PROGRAM_CONSTANT) {
560 /* a literal */
561 const int component = GET_SWZ(src->Swizzle, channel);
562 const GLfloat *param =
563 c->fp->program.Base.Parameters->ParameterValues[src->Index];
564 GLfloat value = param[component];
565 if (src->Negate & (1 << channel))
566 value = -value;
567 if (src->Abs)
568 value = FABSF(value);
569 #if 0
570 printf(" form immed value %f for chan %d\n", value, channel);
571 #endif
572 return brw_imm_f(value);
573 }
574 else {
575 return get_src_reg(c, inst, srcRegIndex, channel);
576 }
577 }
578
579
580 /**
581 * Subroutines are minimal support for resusable instruction sequences.
582 * They are implemented as simply as possible to minimise overhead: there
583 * is no explicit support for communication between the caller and callee
584 * other than saving the return address in a temporary register, nor is
585 * there any automatic local storage. This implies that great care is
586 * required before attempting reentrancy or any kind of nested
587 * subroutine invocations.
588 */
589 static void invoke_subroutine( struct brw_wm_compile *c,
590 enum _subroutine subroutine,
591 void (*emit)( struct brw_wm_compile * ) )
592 {
593 struct brw_compile *p = &c->func;
594
595 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
596
597 if( c->subroutines[ subroutine ] ) {
598 /* subroutine previously emitted: reuse existing instructions */
599
600 int mark = mark_tmps( c );
601 struct brw_reg return_address = retype( alloc_tmp( c ),
602 BRW_REGISTER_TYPE_UD );
603 int here = p->nr_insn;
604
605 brw_push_insn_state(p);
606 brw_set_mask_control(p, BRW_MASK_DISABLE);
607 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
608
609 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
610 brw_imm_d( ( c->subroutines[ subroutine ] -
611 here - 1 ) << 4 ) );
612 brw_pop_insn_state(p);
613
614 release_tmps( c, mark );
615 } else {
616 /* previously unused subroutine: emit, and mark for later reuse */
617
618 int mark = mark_tmps( c );
619 struct brw_reg return_address = retype( alloc_tmp( c ),
620 BRW_REGISTER_TYPE_UD );
621 struct brw_instruction *calc;
622 int base = p->nr_insn;
623
624 brw_push_insn_state(p);
625 brw_set_mask_control(p, BRW_MASK_DISABLE);
626 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
627 brw_pop_insn_state(p);
628
629 c->subroutines[ subroutine ] = p->nr_insn;
630
631 emit( c );
632
633 brw_push_insn_state(p);
634 brw_set_mask_control(p, BRW_MASK_DISABLE);
635 brw_MOV( p, brw_ip_reg(), return_address );
636 brw_pop_insn_state(p);
637
638 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
639
640 release_tmps( c, mark );
641 }
642 }
643
644 static void emit_abs( struct brw_wm_compile *c,
645 const struct prog_instruction *inst)
646 {
647 int i;
648 struct brw_compile *p = &c->func;
649 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
650 for (i = 0; i < 4; i++) {
651 if (inst->DstReg.WriteMask & (1<<i)) {
652 struct brw_reg src, dst;
653 dst = get_dst_reg(c, inst, i);
654 src = get_src_reg(c, inst, 0, i);
655 brw_MOV(p, dst, brw_abs(src));
656 }
657 }
658 brw_set_saturate(p, 0);
659 }
660
661 static void emit_trunc( struct brw_wm_compile *c,
662 const struct prog_instruction *inst)
663 {
664 int i;
665 struct brw_compile *p = &c->func;
666 GLuint mask = inst->DstReg.WriteMask;
667 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
668 for (i = 0; i < 4; i++) {
669 if (mask & (1<<i)) {
670 struct brw_reg src, dst;
671 dst = get_dst_reg(c, inst, i);
672 src = get_src_reg(c, inst, 0, i);
673 brw_RNDZ(p, dst, src);
674 }
675 }
676 brw_set_saturate(p, 0);
677 }
678
679 static void emit_mov( struct brw_wm_compile *c,
680 const struct prog_instruction *inst)
681 {
682 int i;
683 struct brw_compile *p = &c->func;
684 GLuint mask = inst->DstReg.WriteMask;
685 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
686 for (i = 0; i < 4; i++) {
687 if (mask & (1<<i)) {
688 struct brw_reg src, dst;
689 dst = get_dst_reg(c, inst, i);
690 /* XXX some moves from immediate value don't work reliably!!! */
691 /*src = get_src_reg_imm(c, inst, 0, i);*/
692 src = get_src_reg(c, inst, 0, i);
693 brw_MOV(p, dst, src);
694 }
695 }
696 brw_set_saturate(p, 0);
697 }
698
699 static void emit_pixel_xy(struct brw_wm_compile *c,
700 const struct prog_instruction *inst)
701 {
702 struct brw_reg r1 = brw_vec1_grf(1, 0);
703 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
704
705 struct brw_reg dst0, dst1;
706 struct brw_compile *p = &c->func;
707 GLuint mask = inst->DstReg.WriteMask;
708
709 dst0 = get_dst_reg(c, inst, 0);
710 dst1 = get_dst_reg(c, inst, 1);
711 /* Calculate pixel centers by adding 1 or 0 to each of the
712 * micro-tile coordinates passed in r1.
713 */
714 if (mask & WRITEMASK_X) {
715 brw_ADD(p,
716 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
717 stride(suboffset(r1_uw, 4), 2, 4, 0),
718 brw_imm_v(0x10101010));
719 }
720
721 if (mask & WRITEMASK_Y) {
722 brw_ADD(p,
723 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
724 stride(suboffset(r1_uw, 5), 2, 4, 0),
725 brw_imm_v(0x11001100));
726 }
727 }
728
729 static void emit_delta_xy(struct brw_wm_compile *c,
730 const struct prog_instruction *inst)
731 {
732 struct brw_reg r1 = brw_vec1_grf(1, 0);
733 struct brw_reg dst0, dst1, src0, src1;
734 struct brw_compile *p = &c->func;
735 GLuint mask = inst->DstReg.WriteMask;
736
737 dst0 = get_dst_reg(c, inst, 0);
738 dst1 = get_dst_reg(c, inst, 1);
739 src0 = get_src_reg(c, inst, 0, 0);
740 src1 = get_src_reg(c, inst, 0, 1);
741 /* Calc delta X,Y by subtracting origin in r1 from the pixel
742 * centers.
743 */
744 if (mask & WRITEMASK_X) {
745 brw_ADD(p,
746 dst0,
747 retype(src0, BRW_REGISTER_TYPE_UW),
748 negate(r1));
749 }
750
751 if (mask & WRITEMASK_Y) {
752 brw_ADD(p,
753 dst1,
754 retype(src1, BRW_REGISTER_TYPE_UW),
755 negate(suboffset(r1,1)));
756
757 }
758 }
759
760 static void fire_fb_write( struct brw_wm_compile *c,
761 GLuint base_reg,
762 GLuint nr,
763 GLuint target,
764 GLuint eot)
765 {
766 struct brw_compile *p = &c->func;
767 /* Pass through control information:
768 */
769 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
770 {
771 brw_push_insn_state(p);
772 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
773 brw_MOV(p,
774 brw_message_reg(base_reg + 1),
775 brw_vec8_grf(1, 0));
776 brw_pop_insn_state(p);
777 }
778 /* Send framebuffer write message: */
779 brw_fb_WRITE(p,
780 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
781 base_reg,
782 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
783 target,
784 nr,
785 0,
786 eot);
787 }
788
789 static void emit_fb_write(struct brw_wm_compile *c,
790 const struct prog_instruction *inst)
791 {
792 struct brw_compile *p = &c->func;
793 int nr = 2;
794 int channel;
795 GLuint target, eot;
796 struct brw_reg src0;
797
798 /* Reserve a space for AA - may not be needed:
799 */
800 if (c->key.aa_dest_stencil_reg)
801 nr += 1;
802
803 brw_push_insn_state(p);
804 for (channel = 0; channel < 4; channel++) {
805 src0 = get_src_reg(c, inst, 0, channel);
806 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
807 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
808 brw_MOV(p, brw_message_reg(nr + channel), src0);
809 }
810 /* skip over the regs populated above: */
811 nr += 8;
812 brw_pop_insn_state(p);
813
814 if (c->key.source_depth_to_render_target) {
815 if (c->key.computes_depth) {
816 src0 = get_src_reg(c, inst, 2, 2);
817 brw_MOV(p, brw_message_reg(nr), src0);
818 }
819 else {
820 src0 = get_src_reg(c, inst, 1, 1);
821 brw_MOV(p, brw_message_reg(nr), src0);
822 }
823
824 nr += 2;
825 }
826
827 if (c->key.dest_depth_reg) {
828 const GLuint comp = c->key.dest_depth_reg / 2;
829 const GLuint off = c->key.dest_depth_reg % 2;
830
831 if (off != 0) {
832 /* XXX this code needs review/testing */
833 struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
834 struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
835
836 brw_push_insn_state(p);
837 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
838
839 brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
840 /* 2nd half? */
841 brw_MOV(p, brw_message_reg(nr+1), arg1_1);
842 brw_pop_insn_state(p);
843 }
844 else
845 {
846 struct brw_reg src = get_src_reg(c, inst, 1, 1);
847 brw_MOV(p, brw_message_reg(nr), src);
848 }
849 nr += 2;
850 }
851
852 target = inst->Aux >> 1;
853 eot = inst->Aux & 1;
854 fire_fb_write(c, 0, nr, target, eot);
855 }
856
857 static void emit_pixel_w( struct brw_wm_compile *c,
858 const struct prog_instruction *inst)
859 {
860 struct brw_compile *p = &c->func;
861 GLuint mask = inst->DstReg.WriteMask;
862 if (mask & WRITEMASK_W) {
863 struct brw_reg dst, src0, delta0, delta1;
864 struct brw_reg interp3;
865
866 dst = get_dst_reg(c, inst, 3);
867 src0 = get_src_reg(c, inst, 0, 0);
868 delta0 = get_src_reg(c, inst, 1, 0);
869 delta1 = get_src_reg(c, inst, 1, 1);
870
871 interp3 = brw_vec1_grf(src0.nr+1, 4);
872 /* Calc 1/w - just linterp wpos[3] optimized by putting the
873 * result straight into a message reg.
874 */
875 brw_LINE(p, brw_null_reg(), interp3, delta0);
876 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
877
878 /* Calc w */
879 brw_math_16( p, dst,
880 BRW_MATH_FUNCTION_INV,
881 BRW_MATH_SATURATE_NONE,
882 2, brw_null_reg(),
883 BRW_MATH_PRECISION_FULL);
884 }
885 }
886
887 static void emit_linterp(struct brw_wm_compile *c,
888 const struct prog_instruction *inst)
889 {
890 struct brw_compile *p = &c->func;
891 GLuint mask = inst->DstReg.WriteMask;
892 struct brw_reg interp[4];
893 struct brw_reg dst, delta0, delta1;
894 struct brw_reg src0;
895 GLuint nr, i;
896
897 src0 = get_src_reg(c, inst, 0, 0);
898 delta0 = get_src_reg(c, inst, 1, 0);
899 delta1 = get_src_reg(c, inst, 1, 1);
900 nr = src0.nr;
901
902 interp[0] = brw_vec1_grf(nr, 0);
903 interp[1] = brw_vec1_grf(nr, 4);
904 interp[2] = brw_vec1_grf(nr+1, 0);
905 interp[3] = brw_vec1_grf(nr+1, 4);
906
907 for(i = 0; i < 4; i++ ) {
908 if (mask & (1<<i)) {
909 dst = get_dst_reg(c, inst, i);
910 brw_LINE(p, brw_null_reg(), interp[i], delta0);
911 brw_MAC(p, dst, suboffset(interp[i],1), delta1);
912 }
913 }
914 }
915
916 static void emit_cinterp(struct brw_wm_compile *c,
917 const struct prog_instruction *inst)
918 {
919 struct brw_compile *p = &c->func;
920 GLuint mask = inst->DstReg.WriteMask;
921
922 struct brw_reg interp[4];
923 struct brw_reg dst, src0;
924 GLuint nr, i;
925
926 src0 = get_src_reg(c, inst, 0, 0);
927 nr = src0.nr;
928
929 interp[0] = brw_vec1_grf(nr, 0);
930 interp[1] = brw_vec1_grf(nr, 4);
931 interp[2] = brw_vec1_grf(nr+1, 0);
932 interp[3] = brw_vec1_grf(nr+1, 4);
933
934 for(i = 0; i < 4; i++ ) {
935 if (mask & (1<<i)) {
936 dst = get_dst_reg(c, inst, i);
937 brw_MOV(p, dst, suboffset(interp[i],3));
938 }
939 }
940 }
941
942 static void emit_pinterp(struct brw_wm_compile *c,
943 const struct prog_instruction *inst)
944 {
945 struct brw_compile *p = &c->func;
946 GLuint mask = inst->DstReg.WriteMask;
947
948 struct brw_reg interp[4];
949 struct brw_reg dst, delta0, delta1;
950 struct brw_reg src0, w;
951 GLuint nr, i;
952
953 src0 = get_src_reg(c, inst, 0, 0);
954 delta0 = get_src_reg(c, inst, 1, 0);
955 delta1 = get_src_reg(c, inst, 1, 1);
956 w = get_src_reg(c, inst, 2, 3);
957 nr = src0.nr;
958
959 interp[0] = brw_vec1_grf(nr, 0);
960 interp[1] = brw_vec1_grf(nr, 4);
961 interp[2] = brw_vec1_grf(nr+1, 0);
962 interp[3] = brw_vec1_grf(nr+1, 4);
963
964 for(i = 0; i < 4; i++ ) {
965 if (mask & (1<<i)) {
966 dst = get_dst_reg(c, inst, i);
967 brw_LINE(p, brw_null_reg(), interp[i], delta0);
968 brw_MAC(p, dst, suboffset(interp[i],1),
969 delta1);
970 brw_MUL(p, dst, dst, w);
971 }
972 }
973 }
974
975 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
976 static void emit_frontfacing(struct brw_wm_compile *c,
977 const struct prog_instruction *inst)
978 {
979 struct brw_compile *p = &c->func;
980 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
981 struct brw_reg dst;
982 GLuint mask = inst->DstReg.WriteMask;
983 int i;
984
985 for (i = 0; i < 4; i++) {
986 if (mask & (1<<i)) {
987 dst = get_dst_reg(c, inst, i);
988 brw_MOV(p, dst, brw_imm_f(0.0));
989 }
990 }
991
992 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
993 * us front face
994 */
995 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
996 for (i = 0; i < 4; i++) {
997 if (mask & (1<<i)) {
998 dst = get_dst_reg(c, inst, i);
999 brw_MOV(p, dst, brw_imm_f(1.0));
1000 }
1001 }
1002 brw_set_predicate_control_flag_value(p, 0xff);
1003 }
1004
1005 static void emit_xpd(struct brw_wm_compile *c,
1006 const struct prog_instruction *inst)
1007 {
1008 int i;
1009 struct brw_compile *p = &c->func;
1010 GLuint mask = inst->DstReg.WriteMask;
1011 for (i = 0; i < 4; i++) {
1012 GLuint i2 = (i+2)%3;
1013 GLuint i1 = (i+1)%3;
1014 if (mask & (1<<i)) {
1015 struct brw_reg src0, src1, dst;
1016 dst = get_dst_reg(c, inst, i);
1017 src0 = negate(get_src_reg(c, inst, 0, i2));
1018 src1 = get_src_reg_imm(c, inst, 1, i1);
1019 brw_MUL(p, brw_null_reg(), src0, src1);
1020 src0 = get_src_reg(c, inst, 0, i1);
1021 src1 = get_src_reg_imm(c, inst, 1, i2);
1022 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1023 brw_MAC(p, dst, src0, src1);
1024 brw_set_saturate(p, 0);
1025 }
1026 }
1027 brw_set_saturate(p, 0);
1028 }
1029
1030 static void emit_dp3(struct brw_wm_compile *c,
1031 const struct prog_instruction *inst)
1032 {
1033 struct brw_reg src0[3], src1[3], dst;
1034 int i;
1035 struct brw_compile *p = &c->func;
1036 for (i = 0; i < 3; i++) {
1037 src0[i] = get_src_reg(c, inst, 0, i);
1038 src1[i] = get_src_reg_imm(c, inst, 1, i);
1039 }
1040
1041 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1042 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1043 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1044 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1045 brw_MAC(p, dst, src0[2], src1[2]);
1046 brw_set_saturate(p, 0);
1047 }
1048
1049 static void emit_dp4(struct brw_wm_compile *c,
1050 const struct prog_instruction *inst)
1051 {
1052 struct brw_reg src0[4], src1[4], dst;
1053 int i;
1054 struct brw_compile *p = &c->func;
1055 for (i = 0; i < 4; i++) {
1056 src0[i] = get_src_reg(c, inst, 0, i);
1057 src1[i] = get_src_reg_imm(c, inst, 1, i);
1058 }
1059 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1060 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1061 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1062 brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
1063 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1064 brw_MAC(p, dst, src0[3], src1[3]);
1065 brw_set_saturate(p, 0);
1066 }
1067
1068 static void emit_dph(struct brw_wm_compile *c,
1069 const struct prog_instruction *inst)
1070 {
1071 struct brw_reg src0[4], src1[4], dst;
1072 int i;
1073 struct brw_compile *p = &c->func;
1074 for (i = 0; i < 4; i++) {
1075 src0[i] = get_src_reg(c, inst, 0, i);
1076 src1[i] = get_src_reg_imm(c, inst, 1, i);
1077 }
1078 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1079 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1080 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1081 brw_MAC(p, dst, src0[2], src1[2]);
1082 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1083 brw_ADD(p, dst, dst, src1[3]);
1084 brw_set_saturate(p, 0);
1085 }
1086
1087 /**
1088 * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
1089 * Note that the result of the function is smeared across the dest
1090 * register's X, Y, Z and W channels (subject to writemasking of course).
1091 */
1092 static void emit_math1(struct brw_wm_compile *c,
1093 const struct prog_instruction *inst, GLuint func)
1094 {
1095 struct brw_compile *p = &c->func;
1096 struct brw_reg src0, dst, tmp;
1097 const int mark = mark_tmps( c );
1098 int i;
1099
1100 tmp = alloc_tmp(c);
1101
1102 /* Get first component of source register */
1103 src0 = get_src_reg(c, inst, 0, 0);
1104
1105 /* tmp = func(src0) */
1106 brw_MOV(p, brw_message_reg(2), src0);
1107 brw_math(p,
1108 tmp,
1109 func,
1110 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1111 2,
1112 brw_null_reg(),
1113 BRW_MATH_DATA_VECTOR,
1114 BRW_MATH_PRECISION_FULL);
1115
1116 /*tmp.dw1.bits.swizzle = SWIZZLE_XXXX;*/
1117
1118 /* replicate tmp value across enabled dest channels */
1119 for (i = 0; i < 4; i++) {
1120 if (inst->DstReg.WriteMask & (1 << i)) {
1121 dst = get_dst_reg(c, inst, i);
1122 brw_MOV(p, dst, tmp);
1123 }
1124 }
1125
1126 release_tmps(c, mark);
1127 }
1128
1129 static void emit_rcp(struct brw_wm_compile *c,
1130 const struct prog_instruction *inst)
1131 {
1132 emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
1133 }
1134
1135 static void emit_rsq(struct brw_wm_compile *c,
1136 const struct prog_instruction *inst)
1137 {
1138 emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
1139 }
1140
1141 static void emit_sin(struct brw_wm_compile *c,
1142 const struct prog_instruction *inst)
1143 {
1144 emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
1145 }
1146
1147 static void emit_cos(struct brw_wm_compile *c,
1148 const struct prog_instruction *inst)
1149 {
1150 emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
1151 }
1152
1153 static void emit_ex2(struct brw_wm_compile *c,
1154 const struct prog_instruction *inst)
1155 {
1156 emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
1157 }
1158
1159 static void emit_lg2(struct brw_wm_compile *c,
1160 const struct prog_instruction *inst)
1161 {
1162 emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
1163 }
1164
1165 static void emit_add(struct brw_wm_compile *c,
1166 const struct prog_instruction *inst)
1167 {
1168 struct brw_compile *p = &c->func;
1169 struct brw_reg src0, src1, dst;
1170 GLuint mask = inst->DstReg.WriteMask;
1171 int i;
1172 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1173 for (i = 0 ; i < 4; i++) {
1174 if (mask & (1<<i)) {
1175 dst = get_dst_reg(c, inst, i);
1176 src0 = get_src_reg(c, inst, 0, i);
1177 src1 = get_src_reg_imm(c, inst, 1, i);
1178 brw_ADD(p, dst, src0, src1);
1179 }
1180 }
1181 brw_set_saturate(p, 0);
1182 }
1183
1184 static void emit_arl(struct brw_wm_compile *c,
1185 const struct prog_instruction *inst)
1186 {
1187 struct brw_compile *p = &c->func;
1188 struct brw_reg src0, addr_reg;
1189 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1190 addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1191 BRW_ARF_ADDRESS, 0);
1192 src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
1193 brw_MOV(p, addr_reg, src0);
1194 brw_set_saturate(p, 0);
1195 }
1196
1197 static void emit_sub(struct brw_wm_compile *c,
1198 const struct prog_instruction *inst)
1199 {
1200 struct brw_compile *p = &c->func;
1201 struct brw_reg src0, src1, dst;
1202 GLuint mask = inst->DstReg.WriteMask;
1203 int i;
1204 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1205 for (i = 0 ; i < 4; i++) {
1206 if (mask & (1<<i)) {
1207 dst = get_dst_reg(c, inst, i);
1208 src0 = get_src_reg(c, inst, 0, i);
1209 src1 = get_src_reg_imm(c, inst, 1, i);
1210 brw_ADD(p, dst, src0, negate(src1));
1211 }
1212 }
1213 brw_set_saturate(p, 0);
1214 }
1215
1216 static void emit_mul(struct brw_wm_compile *c,
1217 const struct prog_instruction *inst)
1218 {
1219 struct brw_compile *p = &c->func;
1220 struct brw_reg src0, src1, dst;
1221 GLuint mask = inst->DstReg.WriteMask;
1222 int i;
1223 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1224 for (i = 0 ; i < 4; i++) {
1225 if (mask & (1<<i)) {
1226 dst = get_dst_reg(c, inst, i);
1227 src0 = get_src_reg(c, inst, 0, i);
1228 src1 = get_src_reg_imm(c, inst, 1, i);
1229 brw_MUL(p, dst, src0, src1);
1230 }
1231 }
1232 brw_set_saturate(p, 0);
1233 }
1234
1235 static void emit_frc(struct brw_wm_compile *c,
1236 const struct prog_instruction *inst)
1237 {
1238 struct brw_compile *p = &c->func;
1239 struct brw_reg src0, dst;
1240 GLuint mask = inst->DstReg.WriteMask;
1241 int i;
1242 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1243 for (i = 0 ; i < 4; i++) {
1244 if (mask & (1<<i)) {
1245 dst = get_dst_reg(c, inst, i);
1246 src0 = get_src_reg_imm(c, inst, 0, i);
1247 brw_FRC(p, dst, src0);
1248 }
1249 }
1250 if (inst->SaturateMode != SATURATE_OFF)
1251 brw_set_saturate(p, 0);
1252 }
1253
1254 static void emit_flr(struct brw_wm_compile *c,
1255 const struct prog_instruction *inst)
1256 {
1257 struct brw_compile *p = &c->func;
1258 struct brw_reg src0, dst;
1259 GLuint mask = inst->DstReg.WriteMask;
1260 int i;
1261 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1262 for (i = 0 ; i < 4; i++) {
1263 if (mask & (1<<i)) {
1264 dst = get_dst_reg(c, inst, i);
1265 src0 = get_src_reg_imm(c, inst, 0, i);
1266 brw_RNDD(p, dst, src0);
1267 }
1268 }
1269 brw_set_saturate(p, 0);
1270 }
1271
1272
1273 static void emit_min_max(struct brw_wm_compile *c,
1274 const struct prog_instruction *inst)
1275 {
1276 struct brw_compile *p = &c->func;
1277 const GLuint mask = inst->DstReg.WriteMask;
1278 const int mark = mark_tmps(c);
1279 int i;
1280 brw_push_insn_state(p);
1281 for (i = 0; i < 4; i++) {
1282 if (mask & (1<<i)) {
1283 struct brw_reg real_dst = get_dst_reg(c, inst, i);
1284 struct brw_reg src0 = get_src_reg(c, inst, 0, i);
1285 struct brw_reg src1 = get_src_reg(c, inst, 1, i);
1286 struct brw_reg dst;
1287 /* if dst==src0 or dst==src1 we need to use a temp reg */
1288 GLboolean use_temp = brw_same_reg(dst, src0) ||
1289 brw_same_reg(dst, src1);
1290 if (use_temp)
1291 dst = alloc_tmp(c);
1292 else
1293 dst = real_dst;
1294
1295 /*
1296 printf(" Min/max: dst %d src0 %d src1 %d\n",
1297 dst.nr, src0.nr, src1.nr);
1298 */
1299 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1300 brw_MOV(p, dst, src0);
1301 brw_set_saturate(p, 0);
1302
1303 if (inst->Opcode == OPCODE_MIN)
1304 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
1305 else
1306 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
1307
1308 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1309 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1310 brw_MOV(p, dst, src1);
1311 brw_set_saturate(p, 0);
1312 brw_set_predicate_control_flag_value(p, 0xff);
1313 if (use_temp)
1314 brw_MOV(p, real_dst, dst);
1315 }
1316 }
1317 brw_pop_insn_state(p);
1318 release_tmps(c, mark);
1319 }
1320
1321 static void emit_pow(struct brw_wm_compile *c,
1322 const struct prog_instruction *inst)
1323 {
1324 struct brw_compile *p = &c->func;
1325 struct brw_reg dst, src0, src1;
1326 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
1327 src0 = get_src_reg_imm(c, inst, 0, 0);
1328 src1 = get_src_reg_imm(c, inst, 1, 0);
1329
1330 brw_MOV(p, brw_message_reg(2), src0);
1331 brw_MOV(p, brw_message_reg(3), src1);
1332
1333 brw_math(p,
1334 dst,
1335 BRW_MATH_FUNCTION_POW,
1336 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1337 2,
1338 brw_null_reg(),
1339 BRW_MATH_DATA_VECTOR,
1340 BRW_MATH_PRECISION_FULL);
1341 }
1342
1343 static void emit_lrp(struct brw_wm_compile *c,
1344 const struct prog_instruction *inst)
1345 {
1346 struct brw_compile *p = &c->func;
1347 GLuint mask = inst->DstReg.WriteMask;
1348 struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
1349 int i;
1350 int mark = mark_tmps(c);
1351 for (i = 0; i < 4; i++) {
1352 if (mask & (1<<i)) {
1353 dst = get_dst_reg(c, inst, i);
1354 src0 = get_src_reg(c, inst, 0, i);
1355
1356 src1 = get_src_reg_imm(c, inst, 1, i);
1357
1358 if (src1.nr == dst.nr) {
1359 tmp1 = alloc_tmp(c);
1360 brw_MOV(p, tmp1, src1);
1361 } else
1362 tmp1 = src1;
1363
1364 src2 = get_src_reg(c, inst, 2, i);
1365 if (src2.nr == dst.nr) {
1366 tmp2 = alloc_tmp(c);
1367 brw_MOV(p, tmp2, src2);
1368 } else
1369 tmp2 = src2;
1370
1371 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1372 brw_MUL(p, brw_null_reg(), dst, tmp2);
1373 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1374 brw_MAC(p, dst, src0, tmp1);
1375 brw_set_saturate(p, 0);
1376 }
1377 release_tmps(c, mark);
1378 }
1379 }
1380
1381 /**
1382 * For GLSL shaders, this KIL will be unconditional.
1383 * It may be contained inside an IF/ENDIF structure of course.
1384 */
1385 static void emit_kil(struct brw_wm_compile *c)
1386 {
1387 struct brw_compile *p = &c->func;
1388 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1389 brw_push_insn_state(p);
1390 brw_set_mask_control(p, BRW_MASK_DISABLE);
1391 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1392 brw_AND(p, depth, c->emit_mask_reg, depth);
1393 brw_pop_insn_state(p);
1394 }
1395
1396 static void emit_mad(struct brw_wm_compile *c,
1397 const struct prog_instruction *inst)
1398 {
1399 struct brw_compile *p = &c->func;
1400 GLuint mask = inst->DstReg.WriteMask;
1401 struct brw_reg dst, src0, src1, src2;
1402 int i;
1403
1404 for (i = 0; i < 4; i++) {
1405 if (mask & (1<<i)) {
1406 dst = get_dst_reg(c, inst, i);
1407 src0 = get_src_reg(c, inst, 0, i);
1408 src1 = get_src_reg_imm(c, inst, 1, i);
1409 src2 = get_src_reg_imm(c, inst, 2, i);
1410 brw_MUL(p, dst, src0, src1);
1411
1412 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1413 brw_ADD(p, dst, dst, src2);
1414 brw_set_saturate(p, 0);
1415 }
1416 }
1417 }
1418
1419 static void emit_sop(struct brw_wm_compile *c,
1420 const struct prog_instruction *inst, GLuint cond)
1421 {
1422 struct brw_compile *p = &c->func;
1423 GLuint mask = inst->DstReg.WriteMask;
1424 struct brw_reg dst, src0, src1;
1425 int i;
1426
1427 for (i = 0; i < 4; i++) {
1428 if (mask & (1<<i)) {
1429 dst = get_dst_reg(c, inst, i);
1430 src0 = get_src_reg(c, inst, 0, i);
1431 src1 = get_src_reg_imm(c, inst, 1, i);
1432 brw_push_insn_state(p);
1433 brw_CMP(p, brw_null_reg(), cond, src0, src1);
1434 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1435 brw_MOV(p, dst, brw_imm_f(0.0));
1436 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1437 brw_MOV(p, dst, brw_imm_f(1.0));
1438 brw_pop_insn_state(p);
1439 }
1440 }
1441 }
1442
1443 static void emit_slt(struct brw_wm_compile *c,
1444 const struct prog_instruction *inst)
1445 {
1446 emit_sop(c, inst, BRW_CONDITIONAL_L);
1447 }
1448
1449 static void emit_sle(struct brw_wm_compile *c,
1450 const struct prog_instruction *inst)
1451 {
1452 emit_sop(c, inst, BRW_CONDITIONAL_LE);
1453 }
1454
1455 static void emit_sgt(struct brw_wm_compile *c,
1456 const struct prog_instruction *inst)
1457 {
1458 emit_sop(c, inst, BRW_CONDITIONAL_G);
1459 }
1460
1461 static void emit_sge(struct brw_wm_compile *c,
1462 const struct prog_instruction *inst)
1463 {
1464 emit_sop(c, inst, BRW_CONDITIONAL_GE);
1465 }
1466
1467 static void emit_seq(struct brw_wm_compile *c,
1468 const struct prog_instruction *inst)
1469 {
1470 emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1471 }
1472
1473 static void emit_sne(struct brw_wm_compile *c,
1474 const struct prog_instruction *inst)
1475 {
1476 emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1477 }
1478
1479 static void emit_ddx(struct brw_wm_compile *c,
1480 const struct prog_instruction *inst)
1481 {
1482 struct brw_compile *p = &c->func;
1483 GLuint mask = inst->DstReg.WriteMask;
1484 struct brw_reg interp[4];
1485 struct brw_reg dst;
1486 struct brw_reg src0, w;
1487 GLuint nr, i;
1488 src0 = get_src_reg(c, inst, 0, 0);
1489 w = get_src_reg(c, inst, 1, 3);
1490 nr = src0.nr;
1491 interp[0] = brw_vec1_grf(nr, 0);
1492 interp[1] = brw_vec1_grf(nr, 4);
1493 interp[2] = brw_vec1_grf(nr+1, 0);
1494 interp[3] = brw_vec1_grf(nr+1, 4);
1495 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1496 for(i = 0; i < 4; i++ ) {
1497 if (mask & (1<<i)) {
1498 dst = get_dst_reg(c, inst, i);
1499 brw_MOV(p, dst, interp[i]);
1500 brw_MUL(p, dst, dst, w);
1501 }
1502 }
1503 brw_set_saturate(p, 0);
1504 }
1505
1506 static void emit_ddy(struct brw_wm_compile *c,
1507 const struct prog_instruction *inst)
1508 {
1509 struct brw_compile *p = &c->func;
1510 GLuint mask = inst->DstReg.WriteMask;
1511 struct brw_reg interp[4];
1512 struct brw_reg dst;
1513 struct brw_reg src0, w;
1514 GLuint nr, i;
1515
1516 src0 = get_src_reg(c, inst, 0, 0);
1517 nr = src0.nr;
1518 w = get_src_reg(c, inst, 1, 3);
1519 interp[0] = brw_vec1_grf(nr, 0);
1520 interp[1] = brw_vec1_grf(nr, 4);
1521 interp[2] = brw_vec1_grf(nr+1, 0);
1522 interp[3] = brw_vec1_grf(nr+1, 4);
1523 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1524 for(i = 0; i < 4; i++ ) {
1525 if (mask & (1<<i)) {
1526 dst = get_dst_reg(c, inst, i);
1527 brw_MOV(p, dst, suboffset(interp[i], 1));
1528 brw_MUL(p, dst, dst, w);
1529 }
1530 }
1531 brw_set_saturate(p, 0);
1532 }
1533
1534 static INLINE struct brw_reg high_words( struct brw_reg reg )
1535 {
1536 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1537 0, 8, 2 );
1538 }
1539
1540 static INLINE struct brw_reg low_words( struct brw_reg reg )
1541 {
1542 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1543 }
1544
1545 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1546 {
1547 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1548 }
1549
1550 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1551 {
1552 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1553 0, 16, 2 );
1554 }
1555
1556 /* One-, two- and three-dimensional Perlin noise, similar to the description
1557 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
/**
 * Emit the body of the one-dimensional noise subroutine.
 *
 * The argument is read from, and the result written back to, the temporary
 * found at lookup_tmp( c, mark - 2 ), i.e. a temporary allocated before this
 * function took its mark.
 * NOTE(review): presumably that is the caller's parameter temporary, with
 * one further slot consumed by invoke_subroutine -- confirm.
 */
1558 static void noise1_sub( struct brw_wm_compile *c ) {
1559
1560 struct brw_compile *p = &c->func;
1561 struct brw_reg param,
1562 x0, x1, /* gradients at each end */
1563 t, tmp[ 2 ], /* float temporaries */
1564 itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1565 int i;
1566 int mark = mark_tmps( c );
1567
1568 x0 = alloc_tmp( c );
1569 x1 = alloc_tmp( c );
1570 t = alloc_tmp( c );
1571 tmp[ 0 ] = alloc_tmp( c );
1572 tmp[ 1 ] = alloc_tmp( c );
/* itmp[ 0..1 ] alias tmp[ 0..1 ] (the values being hashed); itmp[ 2..4 ]
   alias x0, x1 and t, which hold the integer hash multipliers until their
   float values are first written after the hashing is done. */
1573 itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1574 itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1575 itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1576 itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1577 itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1578
1579 param = lookup_tmp( c, mark - 2 );
1580
/* NOTE(review): the access mode is not restored anywhere in this
   function -- confirm that the caller resets it if needed. */
1581 brw_set_access_mode( p, BRW_ALIGN_1 );
1582
1583 brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1584
1585 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1586 be hashed. Also compute the remainder (offset within the unit
1587 length), interleaved to reduce register dependency penalties. */
1588 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1589 brw_FRC( p, param, param );
1590 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1591 brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1592 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1593
1594 /* We're now ready to perform the hashing. The two hashes are
1595 interleaved for performance. The hash function used is
1596 designed to rapidly achieve avalanche and require only 32x16
1597 bit multiplication, and 16-bit swizzles (which we get for
1598 free). We can't use immediate operands in the multiplies,
1599 because immediates are permitted only in src1 and the 16-bit
1600 factor is permitted only in src0. */
/* Three rounds, one per multiplier (itmp[ 2 ], itmp[ 3 ], itmp[ 4 ]): multiply,
   then XOR the high words into the low words. */
1601 for( i = 0; i < 2; i++ )
1602 brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1603 for( i = 0; i < 2; i++ )
1604 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1605 high_words( itmp[ i ] ) );
1606 for( i = 0; i < 2; i++ )
1607 brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1608 for( i = 0; i < 2; i++ )
1609 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1610 high_words( itmp[ i ] ) );
1611 for( i = 0; i < 2; i++ )
1612 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1613 for( i = 0; i < 2; i++ )
1614 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1615 high_words( itmp[ i ] ) );
1616
1617 /* Now we want to initialise the two gradients based on the
1618 hashes. Format conversion from signed integer to float leaves
1619 everything scaled too high by a factor of pow( 2, 31 ), but
1620 we correct for that right at the end. */
/* From here on t, x0 and x1 carry float values; the multipliers they
   aliased are no longer needed.  t = (fractional offset) - 1. */
1621 brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1622 brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1623 brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1624
/* Scale each end's hashed gradient by the offset from that end. */
1625 brw_MUL( p, x0, x0, param );
1626 brw_MUL( p, x1, x1, t );
1627
1628 /* We interpolate between the gradients using the polynomial
1629 6t^5 - 15t^4 + 10t^3 (Perlin). */
1630 brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1631 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1632 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1633 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1634 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1635 brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1636 pipeline */
1637 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1638 brw_MUL( p, param, tmp[ 0 ], param );
/* x0 += (x1 - x0) * weight, with the weight now held in param. */
1639 brw_MUL( p, x1, x1, param );
1640 brw_ADD( p, x0, x0, x1 );
1641 /* scale by pow( 2, -30 ), to compensate for the format conversion
1642 above and an extra factor of 2 so that a single gradient covers
1643 the [-1,1] range */
/* The scaled result overwrites param, which is where the caller reads it. */
1644 brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1645
1646 release_tmps( c, mark );
1647 }
1648
1649 static void emit_noise1( struct brw_wm_compile *c,
1650 const struct prog_instruction *inst )
1651 {
1652 struct brw_compile *p = &c->func;
1653 struct brw_reg src, param, dst;
1654 GLuint mask = inst->DstReg.WriteMask;
1655 int i;
1656 int mark = mark_tmps( c );
1657
1658 assert( mark == 0 );
1659
1660 src = get_src_reg( c, inst, 0, 0 );
1661
1662 param = alloc_tmp( c );
1663
1664 brw_MOV( p, param, src );
1665
1666 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1667
1668 /* Fill in the result: */
1669 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1670 for (i = 0 ; i < 4; i++) {
1671 if (mask & (1<<i)) {
1672 dst = get_dst_reg(c, inst, i);
1673 brw_MOV( p, dst, param );
1674 }
1675 }
1676 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1677 brw_set_saturate( p, 0 );
1678
1679 release_tmps( c, mark );
1680 }
1681
/**
 * Emit the body of the two-dimensional noise subroutine.
 *
 * The two arguments are read from the temporaries at lookup_tmp( c, mark - 3 )
 * and lookup_tmp( c, mark - 2 ), i.e. temporaries allocated before this
 * function took its mark; the result is written back to the first of them.
 * NOTE(review): presumably those are the caller's two parameter temporaries,
 * with one further slot consumed by invoke_subroutine -- confirm.
 */
1682 static void noise2_sub( struct brw_wm_compile *c ) {
1683
1684 struct brw_compile *p = &c->func;
1685 struct brw_reg param0, param1,
1686 x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1687 t, tmp[ 4 ], /* float temporaries */
1688 itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1689 int i;
1690 int mark = mark_tmps( c );
1691
1692 x0y0 = alloc_tmp( c );
1693 x0y1 = alloc_tmp( c );
1694 x1y0 = alloc_tmp( c );
1695 x1y1 = alloc_tmp( c );
1696 t = alloc_tmp( c );
/* itmp[ 0..3 ] alias tmp[ 0..3 ] (the values being hashed). */
1697 for( i = 0; i < 4; i++ ) {
1698 tmp[ i ] = alloc_tmp( c );
1699 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1700 }
/* itmp[ 4..6 ] alias x0y0, x0y1 and x1y0, which hold the integer hash
   multipliers until their float values are first written after hashing. */
1701 itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1702 itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1703 itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1704
1705 param0 = lookup_tmp( c, mark - 3 );
1706 param1 = lookup_tmp( c, mark - 2 );
1707
/* NOTE(review): the access mode is not restored anywhere in this
   function -- confirm that the caller resets it if needed. */
1708 brw_set_access_mode( p, BRW_ALIGN_1 );
1709
1710 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1711 be hashed. Also compute the remainders (offsets within the unit
1712 square), interleaved to reduce register dependency penalties. */
1713 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1714 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1715 brw_FRC( p, param0, param0 );
1716 brw_FRC( p, param1, param1 );
1717 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
/* Fold the low words of itmp[ 1 ] into the high words of itmp[ 0 ], so that
   itmp[ 0 ] carries both integer coordinates. */
1718 brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1719 low_words( itmp[ 1 ] ) );
1720 brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1721 brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
/* The other three values are itmp[ 0 ] plus 1 in the high half, in the low
   half, and in both halves. */
1722 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1723 brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1724 brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1725
1726 /* We're now ready to perform the hashing. The four hashes are
1727 interleaved for performance. The hash function used is
1728 designed to rapidly achieve avalanche and require only 32x16
1729 bit multiplication, and 16-bit swizzles (which we get for
1730 free). We can't use immediate operands in the multiplies,
1731 because immediates are permitted only in src1 and the 16-bit
1732 factor is permitted only in src0. */
/* Three rounds, one per multiplier (itmp[ 4 ], itmp[ 5 ], itmp[ 6 ]): multiply,
   then XOR the high words into the low words. */
1733 for( i = 0; i < 4; i++ )
1734 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1735 for( i = 0; i < 4; i++ )
1736 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1737 high_words( itmp[ i ] ) );
1738 for( i = 0; i < 4; i++ )
1739 brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1740 for( i = 0; i < 4; i++ )
1741 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1742 high_words( itmp[ i ] ) );
1743 for( i = 0; i < 4; i++ )
1744 brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1745 for( i = 0; i < 4; i++ )
1746 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1747 high_words( itmp[ i ] ) );
1748
1749 /* Now we want to initialise the four gradients based on the
1750 hashes. Format conversion from signed integer to float leaves
1751 everything scaled too high by a factor of pow( 2, 15 ), but
1752 we correct for that right at the end. */
/* From here on x0y0, x0y1 and x1y0 carry float values; the multipliers they
   aliased are no longer needed.  The low words of each hash are converted
   into the corner registers first... */
1753 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1754 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1755 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1756 brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1757 brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1758
/* ...then the high words are converted in place, overwriting the hashes. */
1759 brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1760 brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1761 brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1762 brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1763
/* Low-word parts are scaled by param0 or (param0 - 1); t is then reused for
   (param1 - 1) to scale the high-word parts. */
1764 brw_MUL( p, x1y0, x1y0, t );
1765 brw_MUL( p, x1y1, x1y1, t );
1766 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1767 brw_MUL( p, x0y0, x0y0, param0 );
1768 brw_MUL( p, x0y1, x0y1, param0 );
1769
1770 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1771 brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1772 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1773 brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1774
/* Sum the two parts for each corner. */
1775 brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1776 brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1777 brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1778 brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1779
1780 /* We interpolate between the gradients using the polynomial
1781 6t^5 - 15t^4 + 10t^3 (Perlin). */
1782 brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1783 brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1784 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1785 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1786 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1787 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1788 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1789 pipeline */
1790 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1791 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1792 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1793 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1794 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1795 pipeline */
1796 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1797 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
/* param0 and param1 are overwritten with the interpolation weights. */
1798 brw_MUL( p, param0, tmp[ 0 ], param0 );
1799 brw_MUL( p, param1, tmp[ 1 ], param1 );
1800
1801 /* Here we interpolate in the y dimension... */
1802 brw_MUL( p, x0y1, x0y1, param1 );
1803 brw_MUL( p, x1y1, x1y1, param1 );
1804 brw_ADD( p, x0y0, x0y0, x0y1 );
1805 brw_ADD( p, x1y0, x1y0, x1y1 );
1806
1807 /* And now in x. There are horrible register dependencies here,
1808 but we have nothing else to do. */
1809 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1810 brw_MUL( p, x1y0, x1y0, param0 );
1811 brw_ADD( p, x0y0, x0y0, x1y0 );
1812
1813 /* scale by pow( 2, -15 ), as described above */
/* The scaled result overwrites param0, which is where the caller reads it. */
1814 brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1815
1816 release_tmps( c, mark );
1817 }
1818
1819 static void emit_noise2( struct brw_wm_compile *c,
1820 const struct prog_instruction *inst )
1821 {
1822 struct brw_compile *p = &c->func;
1823 struct brw_reg src0, src1, param0, param1, dst;
1824 GLuint mask = inst->DstReg.WriteMask;
1825 int i;
1826 int mark = mark_tmps( c );
1827
1828 assert( mark == 0 );
1829
1830 src0 = get_src_reg( c, inst, 0, 0 );
1831 src1 = get_src_reg( c, inst, 0, 1 );
1832
1833 param0 = alloc_tmp( c );
1834 param1 = alloc_tmp( c );
1835
1836 brw_MOV( p, param0, src0 );
1837 brw_MOV( p, param1, src1 );
1838
1839 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1840
1841 /* Fill in the result: */
1842 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1843 for (i = 0 ; i < 4; i++) {
1844 if (mask & (1<<i)) {
1845 dst = get_dst_reg(c, inst, i);
1846 brw_MOV( p, dst, param0 );
1847 }
1848 }
1849 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1850 brw_set_saturate( p, 0 );
1851
1852 release_tmps( c, mark );
1853 }
1854
1855 /**
1856 * The three-dimensional case is much like the one- and two- versions above,
1857 * but since the number of corners is rapidly growing we now pack 16 16-bit
1858 * hashes into each register to extract more parallelism from the EUs.
1859 */
1860 static void noise3_sub( struct brw_wm_compile *c ) {
1861
1862 struct brw_compile *p = &c->func;
1863 struct brw_reg param0, param1, param2,
1864 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1865 xi, yi, zi, /* interpolation coefficients */
1866 t, tmp[ 8 ], /* float temporaries */
1867 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1868 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1869 int i;
1870 int mark = mark_tmps( c );
1871
1872 x0y0 = alloc_tmp( c );
1873 x0y1 = alloc_tmp( c );
1874 x1y0 = alloc_tmp( c );
1875 x1y1 = alloc_tmp( c );
1876 xi = alloc_tmp( c );
1877 yi = alloc_tmp( c );
1878 zi = alloc_tmp( c );
1879 t = alloc_tmp( c );
1880 for( i = 0; i < 8; i++ ) {
1881 tmp[ i ] = alloc_tmp( c );
1882 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1883 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1884 }
1885
1886 param0 = lookup_tmp( c, mark - 4 );
1887 param1 = lookup_tmp( c, mark - 3 );
1888 param2 = lookup_tmp( c, mark - 2 );
1889
1890 brw_set_access_mode( p, BRW_ALIGN_1 );
1891
1892 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1893 be hashed. Also compute the remainders (offsets within the unit
1894 cube), interleaved to reduce register dependency penalties. */
1895 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1896 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1897 brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1898 brw_FRC( p, param0, param0 );
1899 brw_FRC( p, param1, param1 );
1900 brw_FRC( p, param2, param2 );
1901 /* Since we now have only 16 bits of precision in the hash, we must
1902 be more careful about thorough mixing to maintain entropy as we
1903 squash the input vector into a small scalar. */
1904 brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1905 brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1906 brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1907 brw_imm_uw( 0x9B93 ) );
1908 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1909 brw_imm_uw( 0xBC8F ) );
1910
1911 /* Temporarily disable the execution mask while we work with ExecSize=16
1912 channels (the mask is set for ExecSize=8 and is probably incorrect).
1913 Although this might cause execution of unwanted channels, the code
1914 writes only to temporary registers and has no side effects, so
1915 disabling the mask is harmless. */
1916 brw_push_insn_state( p );
1917 brw_set_mask_control( p, BRW_MASK_DISABLE );
1918 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1919 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1920 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1921
1922 /* We're now ready to perform the hashing. The eight hashes are
1923 interleaved for performance. The hash function used is
1924 designed to rapidly achieve avalanche and require only 16x16
1925 bit multiplication, and 8-bit swizzles (which we get for
1926 free). */
1927 for( i = 0; i < 4; i++ )
1928 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1929 for( i = 0; i < 4; i++ )
1930 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1931 odd_bytes( wtmp[ i ] ) );
1932 for( i = 0; i < 4; i++ )
1933 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1934 for( i = 0; i < 4; i++ )
1935 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1936 odd_bytes( wtmp[ i ] ) );
1937 brw_pop_insn_state( p );
1938
1939 /* Now we want to initialise the four rear gradients based on the
1940 hashes. Format conversion from signed integer to float leaves
1941 everything scaled too high by a factor of pow( 2, 15 ), but
1942 we correct for that right at the end. */
1943 /* x component */
1944 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1945 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1946 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1947 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1948 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1949
1950 brw_push_insn_state( p );
1951 brw_set_mask_control( p, BRW_MASK_DISABLE );
1952 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1953 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1954 brw_pop_insn_state( p );
1955
1956 brw_MUL( p, x1y0, x1y0, t );
1957 brw_MUL( p, x1y1, x1y1, t );
1958 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1959 brw_MUL( p, x0y0, x0y0, param0 );
1960 brw_MUL( p, x0y1, x0y1, param0 );
1961
1962 /* y component */
1963 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1964 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1965 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1966 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1967
1968 brw_push_insn_state( p );
1969 brw_set_mask_control( p, BRW_MASK_DISABLE );
1970 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1971 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1972 brw_pop_insn_state( p );
1973
1974 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1975 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1976 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1977 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1978 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1979
1980 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1981 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1982 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1983 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1984
1985 /* z component */
1986 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1987 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1988 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1989 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1990
1991 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1992 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1993 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1994 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1995
1996 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1997 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1998 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1999 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2000
2001 /* We interpolate between the gradients using the polynomial
2002 6t^5 - 15t^4 + 10t^3 (Perlin). */
2003 brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
2004 brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
2005 brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
2006 brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
2007 brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
2008 brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
2009 brw_MUL( p, xi, xi, param0 );
2010 brw_MUL( p, yi, yi, param1 );
2011 brw_MUL( p, zi, zi, param2 );
2012 brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
2013 brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
2014 brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
2015 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
2016 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
2017 brw_MUL( p, xi, xi, param0 );
2018 brw_MUL( p, yi, yi, param1 );
2019 brw_MUL( p, zi, zi, param2 );
2020 brw_MUL( p, xi, xi, param0 );
2021 brw_MUL( p, yi, yi, param1 );
2022 brw_MUL( p, zi, zi, param2 );
2023 brw_MUL( p, xi, xi, param0 );
2024 brw_MUL( p, yi, yi, param1 );
2025 brw_MUL( p, zi, zi, param2 );
2026
2027 /* Here we interpolate in the y dimension... */
2028 brw_MUL( p, x0y1, x0y1, yi );
2029 brw_MUL( p, x1y1, x1y1, yi );
2030 brw_ADD( p, x0y0, x0y0, x0y1 );
2031 brw_ADD( p, x1y0, x1y0, x1y1 );
2032
2033 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
2034 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2035 brw_MUL( p, x1y0, x1y0, xi );
2036 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2037
2038 /* Now do the same thing for the front four gradients... */
2039 /* x component */
2040 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2041 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2042 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2043 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2044
2045 brw_push_insn_state( p );
2046 brw_set_mask_control( p, BRW_MASK_DISABLE );
2047 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
2048 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
2049 brw_pop_insn_state( p );
2050
2051 brw_MUL( p, x1y0, x1y0, t );
2052 brw_MUL( p, x1y1, x1y1, t );
2053 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
2054 brw_MUL( p, x0y0, x0y0, param0 );
2055 brw_MUL( p, x0y1, x0y1, param0 );
2056
2057 /* y component */
2058 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2059 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2060 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2061 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2062
2063 brw_push_insn_state( p );
2064 brw_set_mask_control( p, BRW_MASK_DISABLE );
2065 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
2066 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
2067 brw_pop_insn_state( p );
2068
2069 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2070 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2071 brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
2072 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
2073 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
2074
2075 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2076 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2077 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2078 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2079
2080 /* z component */
2081 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2082 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2083 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2084 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2085
2086 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2087 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2088 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2089 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2090
2091 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2092 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2093 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2094 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2095
2096 /* The interpolation coefficients are still around from last time, so
2097 again interpolate in the y dimension... */
2098 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2099 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2100 brw_MUL( p, x0y1, x0y1, yi );
2101 brw_MUL( p, x1y1, x1y1, yi );
2102 brw_ADD( p, x0y0, x0y0, x0y1 );
2103 brw_ADD( p, x1y0, x1y0, x1y1 );
2104
2105 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2106 time put the front face in tmp[ 1 ] and we're nearly there... */
2107 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2108 brw_MUL( p, x1y0, x1y0, xi );
2109 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2110
2111 /* The final interpolation, in the z dimension: */
2112 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2113 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
2114 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2115
2116 /* scale by pow( 2, -15 ), as described above */
2117 brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2118
2119 release_tmps( c, mark );
2120 }
2121
2122 static void emit_noise3( struct brw_wm_compile *c,
2123 const struct prog_instruction *inst )
2124 {
2125 struct brw_compile *p = &c->func;
2126 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
2127 GLuint mask = inst->DstReg.WriteMask;
2128 int i;
2129 int mark = mark_tmps( c );
2130
2131 assert( mark == 0 );
2132
2133 src0 = get_src_reg( c, inst, 0, 0 );
2134 src1 = get_src_reg( c, inst, 0, 1 );
2135 src2 = get_src_reg( c, inst, 0, 2 );
2136
2137 param0 = alloc_tmp( c );
2138 param1 = alloc_tmp( c );
2139 param2 = alloc_tmp( c );
2140
2141 brw_MOV( p, param0, src0 );
2142 brw_MOV( p, param1, src1 );
2143 brw_MOV( p, param2, src2 );
2144
2145 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
2146
2147 /* Fill in the result: */
2148 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2149 for (i = 0 ; i < 4; i++) {
2150 if (mask & (1<<i)) {
2151 dst = get_dst_reg(c, inst, i);
2152 brw_MOV( p, dst, param0 );
2153 }
2154 }
2155 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2156 brw_set_saturate( p, 0 );
2157
2158 release_tmps( c, mark );
2159 }
2160
2161 /**
2162 * For the four-dimensional case, the little micro-optimisation benefits
2163 * we obtain by unrolling all the loops aren't worth the massive bloat it
2164 * now causes. Instead, we loop twice around performing a similar operation
2165 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
2166 * code to glue it all together.
2167 */
/*
 * Emits the body of the 4D noise subroutine into c->func.
 *
 * Inputs:  the four coordinates are read from param[ 0..3 ], which are
 *          looked up (not allocated) at temporary slots mark - 5 .. mark - 2.
 *          Presumably these are temporaries set up by the caller before the
 *          subroutine is invoked -- TODO confirm the slot layout against
 *          invoke_subroutine().
 * Output:  the final noise value is written back to param[ 0 ].
 *
 * Every temporary allocated here is released again on exit via
 * release_tmps( c, mark ).
 *
 * NOTE(review): the access mode is switched to BRW_ALIGN_1 below and is not
 * restored anywhere in this function.
 */
2168 static void noise4_sub( struct brw_wm_compile *c )
2169 {
2170 struct brw_compile *p = &c->func;
2171 struct brw_reg param[ 4 ],
2172 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
2173 w0, /* noise for the w=0 cube */
2174 floors[ 2 ], /* integer coordinates of base corner of hypercube */
2175 interp[ 4 ], /* interpolation coefficients */
2176 t, tmp[ 8 ], /* float temporaries */
2177 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
2178 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
2179 int i, j;
2180 int mark = mark_tmps( c );
2181 GLuint loop, origin; /* instruction indices: loop head, and the exit branch to patch */
2182
2183 x0y0 = alloc_tmp( c );
2184 x0y1 = alloc_tmp( c );
2185 x1y0 = alloc_tmp( c );
2186 x1y1 = alloc_tmp( c );
2187 t = alloc_tmp( c );
2188 w0 = alloc_tmp( c );
2189 floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2190 floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2191
2192 for( i = 0; i < 4; i++ ) {
2193 param[ i ] = lookup_tmp( c, mark - 5 + i );
2194 interp[ i ] = alloc_tmp( c );
2195 }
2196
/* itmp[] and wtmp[] are different typed views of the same registers as
   tmp[]; writing one view changes what the others read. */
2197 for( i = 0; i < 8; i++ ) {
2198 tmp[ i ] = alloc_tmp( c );
2199 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
2200 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
2201 }
2202
2203 brw_set_access_mode( p, BRW_ALIGN_1 );
2204
2205 /* We only want 16 bits of precision from the integral part of each
2206 co-ordinate, but unfortunately the RNDD semantics would saturate
2207 at 16 bits if we performed the operation directly to a 16-bit
2208 destination. Therefore, we round to 32-bit temporaries where
2209 appropriate, and then store only the lower 16 bits. */
2210 brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
2211 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
2212 brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
2213 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
2214 brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
2215 brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
2216
2217 /* Modify the flag register here, because the side effect is useful
2218 later (see below). We know for certain that all flags will be
2219 cleared, since the FRC instruction cannot possibly generate
2220 negative results. Even for exceptional inputs (infinities, denormals,
2221 NaNs), the architecture guarantees that the L conditional is false. */
2222 brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
2223 brw_FRC( p, param[ 0 ], param[ 0 ] );
2224 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2225 for( i = 1; i < 4; i++ )
2226 brw_FRC( p, param[ i ], param[ i ] );
2227
2228 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
2229 of all. */
2230 for( i = 0; i < 4; i++ )
2231 brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
2232 for( i = 0; i < 4; i++ )
2233 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
2234 for( i = 0; i < 4; i++ )
2235 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2236 for( i = 0; i < 4; i++ )
2237 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
2238 for( j = 0; j < 3; j++ )
2239 for( i = 0; i < 4; i++ )
2240 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2241
2242 /* Mark the current address, as it will be a jump destination. The
2243 following code will be executed twice: first, with the flag
2244 register clear indicating the w=0 case, and second with flags
2245 set for w=1. */
2246 loop = p->nr_insn;
2247
2248 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
2249 be hashed. Since we have only 16 bits of precision in the hash, we
2250 must be careful about thorough mixing to maintain entropy as we
2251 squash the input vector into a small scalar. */
2252 brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
2253 brw_imm_uw( 0xBC8F ) );
2254 brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
2255 brw_imm_uw( 0xD0BD ) );
2256 brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
2257 brw_imm_uw( 0x9B93 ) );
2258 brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
2259 brw_imm_uw( 0xA359 ) );
2260 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
2261 brw_imm_uw( 0xBC8F ) );
2262
2263 /* Temporarily disable the execution mask while we work with ExecSize=16
2264 channels (the mask is set for ExecSize=8 and is probably incorrect).
2265 Although this might cause execution of unwanted channels, the code
2266 writes only to temporary registers and has no side effects, so
2267 disabling the mask is harmless. */
2268 brw_push_insn_state( p );
2269 brw_set_mask_control( p, BRW_MASK_DISABLE );
2270 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
2271 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
2272 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
2273
2274 /* We're now ready to perform the hashing. The eight hashes are
2275 interleaved for performance. The hash function used is
2276 designed to rapidly achieve avalanche and require only 16x16
2277 bit multiplication, and 8-bit swizzles (which we get for
2278 free). */
2279 for( i = 0; i < 4; i++ )
2280 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
2281 for( i = 0; i < 4; i++ )
2282 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2283 odd_bytes( wtmp[ i ] ) );
2284 for( i = 0; i < 4; i++ )
2285 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
2286 for( i = 0; i < 4; i++ )
2287 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2288 odd_bytes( wtmp[ i ] ) );
2289 brw_pop_insn_state( p );
2290
2291 /* Now we want to initialise the four rear gradients based on the
2292 hashes. Format conversion from signed integer to float leaves
2293 everything scaled too high by a factor of pow( 2, 15 ), but
2294 we correct for that right at the end. */
2295 /* x component */
2296 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2297 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
2298 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
2299 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
2300 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
2301
/* Shift the hash words left by 4 before the next component is read from
   them; the same shift is repeated between each pair of components. */
2302 brw_push_insn_state( p );
2303 brw_set_mask_control( p, BRW_MASK_DISABLE );
2304 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2305 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2306 brw_pop_insn_state( p );
2307
2308 brw_MUL( p, x1y0, x1y0, t );
2309 brw_MUL( p, x1y1, x1y1, t );
2310 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2311 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2312 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2313
2314 /* y component */
2315 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2316 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2317 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2318 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2319
2320 brw_push_insn_state( p );
2321 brw_set_mask_control( p, BRW_MASK_DISABLE );
2322 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2323 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2324 brw_pop_insn_state( p );
2325
2326 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2327 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2328 /* prepare t for the w component (used below): w the first time through
2329 the loop; w - 1 the second time) */
/* The ADD is emitted predicated and the MOV with the predicate inverted,
   so the two instructions complement each other under the flag register. */
2330 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2331 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2332 p->current->header.predicate_inverse = 1;
2333 brw_MOV( p, t, param[ 3 ] );
2334 p->current->header.predicate_inverse = 0;
2335 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2336 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2337 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2338
2339 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2340 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2341 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2342 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2343
2344 /* z component */
2345 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2346 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2347 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2348 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2349
2350 brw_push_insn_state( p );
2351 brw_set_mask_control( p, BRW_MASK_DISABLE );
2352 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2353 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2354 brw_pop_insn_state( p );
2355
2356 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
2357 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
2358 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
2359 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
2360
2361 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2362 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2363 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2364 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2365
2366 /* w component */
2367 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2368 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2369 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2370 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2371
2372 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2373 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2374 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2375 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2376 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) ); /* t = x - 1, consumed by the front gradients' x component below */
2377
2378 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2379 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2380 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2381 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2382
2383 /* Here we interpolate in the y dimension... */
2384 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2385 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2386 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2387 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2388 brw_ADD( p, x0y0, x0y0, x0y1 );
2389 brw_ADD( p, x1y0, x1y0, x1y1 );
2390
2391 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
2392 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2393 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2394 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2395
2396 /* Now do the same thing for the front four gradients... */
2397 /* x component */
2398 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2399 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2400 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2401 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2402
2403 brw_push_insn_state( p );
2404 brw_set_mask_control( p, BRW_MASK_DISABLE );
2405 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2406 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2407 brw_pop_insn_state( p );
2408
2409 brw_MUL( p, x1y0, x1y0, t );
2410 brw_MUL( p, x1y1, x1y1, t );
2411 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2412 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2413 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2414
2415 /* y component */
2416 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2417 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2418 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2419 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2420
2421 brw_push_insn_state( p );
2422 brw_set_mask_control( p, BRW_MASK_DISABLE );
2423 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2424 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2425 brw_pop_insn_state( p );
2426
2427 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2428 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2429 brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
2430 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2431 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2432
2433 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2434 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2435 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2436 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2437
2438 /* z component */
2439 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2440 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2441 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2442 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2443
2444 brw_push_insn_state( p );
2445 brw_set_mask_control( p, BRW_MASK_DISABLE );
2446 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2447 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2448 brw_pop_insn_state( p );
2449
2450 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2451 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2452 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2453 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2454 /* prepare t for the w component (used below): w the first time through
2455 the loop; w - 1 the second time) */
2456 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2457 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2458 p->current->header.predicate_inverse = 1;
2459 brw_MOV( p, t, param[ 3 ] );
2460 p->current->header.predicate_inverse = 0;
2461 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2462
2463 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2464 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2465 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2466 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2467
2468 /* w component */
2469 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2470 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2471 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2472 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2473
2474 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2475 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2476 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2477 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2478
2479 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2480 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2481 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2482 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2483
2484 /* Interpolate in the y dimension: */
2485 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2486 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2487 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2488 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2489 brw_ADD( p, x0y0, x0y0, x0y1 );
2490 brw_ADD( p, x1y0, x1y0, x1y1 );
2491
2492 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2493 time put the front face in tmp[ 1 ] and we're nearly there... */
2494 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2495 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2496 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2497
2498 /* Another interpolation, in the z dimension: */
2499 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2500 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2501 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2502
2503 /* Exit the loop if we've computed both cubes... */
2504 origin = p->nr_insn; /* index of the predicated branch emitted next */
2505 brw_push_insn_state( p );
2506 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2507 brw_set_mask_control( p, BRW_MASK_DISABLE );
2508 brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) ); /* placeholder offset, patched below */
2509 brw_pop_insn_state( p );
2510
2511 /* Save the result for the w=0 case, and increment the w coordinate: */
2512 brw_MOV( p, w0, tmp[ 0 ] );
2513 brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2514 brw_imm_uw( 1 ) );
2515
2516 /* Loop around for the other cube. Explicitly set the flag register
2517 (unfortunately we must spend an extra instruction to do this: we
2518 can't rely on a side effect of the previous MOV or ADD because
2519 conditional modifiers which are normally true might be false in
2520 exceptional circumstances, e.g. given a NaN input; the add to
2521 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
/* The jump offset is an instruction-count difference shifted left by 4;
   presumably each native instruction occupies 16 bytes -- TODO confirm. */
2522 brw_push_insn_state( p );
2523 brw_set_mask_control( p, BRW_MASK_DISABLE );
2524 brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2525 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2526 brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2527 brw_pop_insn_state( p );
2528
2529 /* Patch the previous conditional branch now that we know the
2530 destination address. */
2531 brw_set_src1( p->store + origin,
2532 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2533
2534 /* The very last interpolation. */
2535 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2536 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2537 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2538
2539 /* scale by pow( 2, -15 ), as described above */
2540 brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2541
2542 release_tmps( c, mark );
2543 }
2544
2545 static void emit_noise4( struct brw_wm_compile *c,
2546 const struct prog_instruction *inst )
2547 {
2548 struct brw_compile *p = &c->func;
2549 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2550 GLuint mask = inst->DstReg.WriteMask;
2551 int i;
2552 int mark = mark_tmps( c );
2553
2554 assert( mark == 0 );
2555
2556 src0 = get_src_reg( c, inst, 0, 0 );
2557 src1 = get_src_reg( c, inst, 0, 1 );
2558 src2 = get_src_reg( c, inst, 0, 2 );
2559 src3 = get_src_reg( c, inst, 0, 3 );
2560
2561 param0 = alloc_tmp( c );
2562 param1 = alloc_tmp( c );
2563 param2 = alloc_tmp( c );
2564 param3 = alloc_tmp( c );
2565
2566 brw_MOV( p, param0, src0 );
2567 brw_MOV( p, param1, src1 );
2568 brw_MOV( p, param2, src2 );
2569 brw_MOV( p, param3, src3 );
2570
2571 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2572
2573 /* Fill in the result: */
2574 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2575 for (i = 0 ; i < 4; i++) {
2576 if (mask & (1<<i)) {
2577 dst = get_dst_reg(c, inst, i);
2578 brw_MOV( p, dst, param0 );
2579 }
2580 }
2581 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2582 brw_set_saturate( p, 0 );
2583
2584 release_tmps( c, mark );
2585 }
2586
2587 static void emit_wpos_xy(struct brw_wm_compile *c,
2588 const struct prog_instruction *inst)
2589 {
2590 struct brw_compile *p = &c->func;
2591 GLuint mask = inst->DstReg.WriteMask;
2592 struct brw_reg src0[2], dst[2];
2593
2594 dst[0] = get_dst_reg(c, inst, 0);
2595 dst[1] = get_dst_reg(c, inst, 1);
2596
2597 src0[0] = get_src_reg(c, inst, 0, 0);
2598 src0[1] = get_src_reg(c, inst, 0, 1);
2599
2600 /* Calculate the pixel offset from window bottom left into destination
2601 * X and Y channels.
2602 */
2603 if (mask & WRITEMASK_X) {
2604 /* X' = X - origin_x */
2605 brw_ADD(p,
2606 dst[0],
2607 retype(src0[0], BRW_REGISTER_TYPE_W),
2608 brw_imm_d(0 - c->key.origin_x));
2609 }
2610
2611 if (mask & WRITEMASK_Y) {
2612 /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2613 brw_ADD(p,
2614 dst[1],
2615 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2616 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2617 }
2618 }
2619
2620 /* TODO
2621 BIAS on SIMD8 not working yet...
2622 */
2623 static void emit_txb(struct brw_wm_compile *c,
2624 const struct prog_instruction *inst)
2625 {
2626 struct brw_compile *p = &c->func;
2627 struct brw_reg dst[4], src[4], payload_reg;
2628 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2629 GLuint i;
2630
2631 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2632
2633 for (i = 0; i < 4; i++)
2634 dst[i] = get_dst_reg(c, inst, i);
2635 for (i = 0; i < 4; i++)
2636 src[i] = get_src_reg(c, inst, 0, i);
2637
2638 switch (inst->TexSrcTarget) {
2639 case TEXTURE_1D_INDEX:
2640 brw_MOV(p, brw_message_reg(2), src[0]); /* s coord */
2641 brw_MOV(p, brw_message_reg(3), brw_imm_f(0)); /* t coord */
2642 brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); /* r coord */
2643 break;
2644 case TEXTURE_2D_INDEX:
2645 case TEXTURE_RECT_INDEX:
2646 brw_MOV(p, brw_message_reg(2), src[0]);
2647 brw_MOV(p, brw_message_reg(3), src[1]);
2648 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2649 break;
2650 default:
2651 brw_MOV(p, brw_message_reg(2), src[0]);
2652 brw_MOV(p, brw_message_reg(3), src[1]);
2653 brw_MOV(p, brw_message_reg(4), src[2]);
2654 break;
2655 }
2656 brw_MOV(p, brw_message_reg(5), src[3]); /* bias */
2657 brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); /* ref (unused?) */
2658 brw_SAMPLE(p,
2659 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2660 1, /* msg_reg_nr */
2661 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2662 SURF_INDEX_TEXTURE(unit),
2663 unit, /* sampler */
2664 inst->DstReg.WriteMask, /* writemask */
2665 BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS, /* msg_type */
2666 4, /* response_length */
2667 4, /* msg_length */
2668 0); /* eot */
2669 }
2670
2671
2672 static void emit_tex(struct brw_wm_compile *c,
2673 const struct prog_instruction *inst)
2674 {
2675 struct brw_compile *p = &c->func;
2676 struct brw_reg dst[4], src[4], payload_reg;
2677 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2678 GLuint msg_len;
2679 GLuint i, nr;
2680 GLuint emit;
2681 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2682
2683 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2684
2685 for (i = 0; i < 4; i++)
2686 dst[i] = get_dst_reg(c, inst, i);
2687 for (i = 0; i < 4; i++)
2688 src[i] = get_src_reg(c, inst, 0, i);
2689
2690 switch (inst->TexSrcTarget) {
2691 case TEXTURE_1D_INDEX:
2692 emit = WRITEMASK_X;
2693 nr = 1;
2694 break;
2695 case TEXTURE_2D_INDEX:
2696 case TEXTURE_RECT_INDEX:
2697 emit = WRITEMASK_XY;
2698 nr = 2;
2699 break;
2700 default:
2701 emit = WRITEMASK_XYZ;
2702 nr = 3;
2703 break;
2704 }
2705 msg_len = 1;
2706
2707 /* move/load S, T, R coords */
2708 for (i = 0; i < nr; i++) {
2709 static const GLuint swz[4] = {0,1,2,2};
2710 if (emit & (1<<i))
2711 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2712 else
2713 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2714 msg_len += 1;
2715 }
2716
2717 if (shadow) {
2718 brw_MOV(p, brw_message_reg(5), brw_imm_f(0)); /* lod / bias */
2719 brw_MOV(p, brw_message_reg(6), src[2]); /* ref value / R coord */
2720 }
2721
2722 brw_SAMPLE(p,
2723 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2724 1, /* msg_reg_nr */
2725 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2726 SURF_INDEX_TEXTURE(unit),
2727 unit, /* sampler */
2728 inst->DstReg.WriteMask, /* writemask */
2729 BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE, /* msg_type */
2730 4, /* response_length */
2731 shadow ? 6 : 4, /* msg_length */
2732 0); /* eot */
2733
2734 if (shadow)
2735 brw_MOV(p, dst[3], brw_imm_f(1.0));
2736 }
2737
2738
2739 /**
2740 * Resolve subroutine calls after code emit is done.
2741 */
2742 static void post_wm_emit( struct brw_wm_compile *c )
2743 {
2744 brw_resolve_cals(&c->func);
2745 }
2746
2747 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2748 {
2749 #define MAX_IFSN 32
2750 #define MAX_LOOP_DEPTH 32
2751 struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
2752 struct brw_instruction *inst0, *inst1;
2753 int i, if_insn = 0, loop_insn = 0;
2754 struct brw_compile *p = &c->func;
2755 struct brw_indirect stack_index = brw_indirect(0, 0);
2756
2757 c->out_of_regs = GL_FALSE;
2758
2759 prealloc_reg(c);
2760 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2761 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2762
2763 for (i = 0; i < c->nr_fp_insns; i++) {
2764 const struct prog_instruction *inst = &c->prog_instructions[i];
2765
2766 c->cur_inst = i;
2767
2768 #if 0
2769 _mesa_printf("Inst %d: ", i);
2770 _mesa_print_instruction(inst);
2771 #endif
2772
2773 /* fetch any constants that this instruction needs */
2774 if (c->fp->use_const_buffer)
2775 fetch_constants(c, inst);
2776
2777 if (inst->CondUpdate)
2778 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2779 else
2780 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2781
2782 switch (inst->Opcode) {
2783 case WM_PIXELXY:
2784 emit_pixel_xy(c, inst);
2785 break;
2786 case WM_DELTAXY:
2787 emit_delta_xy(c, inst);
2788 break;
2789 case WM_PIXELW:
2790 emit_pixel_w(c, inst);
2791 break;
2792 case WM_LINTERP:
2793 emit_linterp(c, inst);
2794 break;
2795 case WM_PINTERP:
2796 emit_pinterp(c, inst);
2797 break;
2798 case WM_CINTERP:
2799 emit_cinterp(c, inst);
2800 break;
2801 case WM_WPOSXY:
2802 emit_wpos_xy(c, inst);
2803 break;
2804 case WM_FB_WRITE:
2805 emit_fb_write(c, inst);
2806 break;
2807 case WM_FRONTFACING:
2808 emit_frontfacing(c, inst);
2809 break;
2810 case OPCODE_ABS:
2811 emit_abs(c, inst);
2812 break;
2813 case OPCODE_ADD:
2814 emit_add(c, inst);
2815 break;
2816 case OPCODE_ARL:
2817 emit_arl(c, inst);
2818 break;
2819 case OPCODE_SUB:
2820 emit_sub(c, inst);
2821 break;
2822 case OPCODE_FRC:
2823 emit_frc(c, inst);
2824 break;
2825 case OPCODE_FLR:
2826 emit_flr(c, inst);
2827 break;
2828 case OPCODE_LRP:
2829 emit_lrp(c, inst);
2830 break;
2831 case OPCODE_TRUNC:
2832 emit_trunc(c, inst);
2833 break;
2834 case OPCODE_MOV:
2835 emit_mov(c, inst);
2836 break;
2837 case OPCODE_DP3:
2838 emit_dp3(c, inst);
2839 break;
2840 case OPCODE_DP4:
2841 emit_dp4(c, inst);
2842 break;
2843 case OPCODE_XPD:
2844 emit_xpd(c, inst);
2845 break;
2846 case OPCODE_DPH:
2847 emit_dph(c, inst);
2848 break;
2849 case OPCODE_RCP:
2850 emit_rcp(c, inst);
2851 break;
2852 case OPCODE_RSQ:
2853 emit_rsq(c, inst);
2854 break;
2855 case OPCODE_SIN:
2856 emit_sin(c, inst);
2857 break;
2858 case OPCODE_COS:
2859 emit_cos(c, inst);
2860 break;
2861 case OPCODE_EX2:
2862 emit_ex2(c, inst);
2863 break;
2864 case OPCODE_LG2:
2865 emit_lg2(c, inst);
2866 break;
2867 case OPCODE_MIN:
2868 case OPCODE_MAX:
2869 emit_min_max(c, inst);
2870 break;
2871 case OPCODE_DDX:
2872 emit_ddx(c, inst);
2873 break;
2874 case OPCODE_DDY:
2875 emit_ddy(c, inst);
2876 break;
2877 case OPCODE_SLT:
2878 emit_slt(c, inst);
2879 break;
2880 case OPCODE_SLE:
2881 emit_sle(c, inst);
2882 break;
2883 case OPCODE_SGT:
2884 emit_sgt(c, inst);
2885 break;
2886 case OPCODE_SGE:
2887 emit_sge(c, inst);
2888 break;
2889 case OPCODE_SEQ:
2890 emit_seq(c, inst);
2891 break;
2892 case OPCODE_SNE:
2893 emit_sne(c, inst);
2894 break;
2895 case OPCODE_MUL:
2896 emit_mul(c, inst);
2897 break;
2898 case OPCODE_POW:
2899 emit_pow(c, inst);
2900 break;
2901 case OPCODE_MAD:
2902 emit_mad(c, inst);
2903 break;
2904 case OPCODE_NOISE1:
2905 emit_noise1(c, inst);
2906 break;
2907 case OPCODE_NOISE2:
2908 emit_noise2(c, inst);
2909 break;
2910 case OPCODE_NOISE3:
2911 emit_noise3(c, inst);
2912 break;
2913 case OPCODE_NOISE4:
2914 emit_noise4(c, inst);
2915 break;
2916 case OPCODE_TEX:
2917 emit_tex(c, inst);
2918 break;
2919 case OPCODE_TXB:
2920 emit_txb(c, inst);
2921 break;
2922 case OPCODE_KIL_NV:
2923 emit_kil(c);
2924 break;
2925 case OPCODE_IF:
2926 assert(if_insn < MAX_IFSN);
2927 if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
2928 break;
2929 case OPCODE_ELSE:
2930 if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
2931 break;
2932 case OPCODE_ENDIF:
2933 assert(if_insn > 0);
2934 brw_ENDIF(p, if_inst[--if_insn]);
2935 break;
2936 case OPCODE_BGNSUB:
2937 brw_save_label(p, inst->Comment, p->nr_insn);
2938 break;
2939 case OPCODE_ENDSUB:
2940 /* no-op */
2941 break;
2942 case OPCODE_CAL:
2943 brw_push_insn_state(p);
2944 brw_set_mask_control(p, BRW_MASK_DISABLE);
2945 brw_set_access_mode(p, BRW_ALIGN_1);
2946 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2947 brw_set_access_mode(p, BRW_ALIGN_16);
2948 brw_ADD(p, get_addr_reg(stack_index),
2949 get_addr_reg(stack_index), brw_imm_d(4));
2950 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2951 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2952 brw_pop_insn_state(p);
2953 break;
2954
2955 case OPCODE_RET:
2956 brw_push_insn_state(p);
2957 brw_set_mask_control(p, BRW_MASK_DISABLE);
2958 brw_ADD(p, get_addr_reg(stack_index),
2959 get_addr_reg(stack_index), brw_imm_d(-4));
2960 brw_set_access_mode(p, BRW_ALIGN_1);
2961 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2962 brw_set_access_mode(p, BRW_ALIGN_16);
2963 brw_pop_insn_state(p);
2964
2965 break;
2966 case OPCODE_BGNLOOP:
2967 /* XXX may need to invalidate the current_constant regs */
2968 loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
2969 break;
2970 case OPCODE_BRK:
2971 brw_BREAK(p);
2972 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2973 break;
2974 case OPCODE_CONT:
2975 brw_CONT(p);
2976 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2977 break;
2978 case OPCODE_ENDLOOP:
2979 loop_insn--;
2980 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
2981 /* patch all the BREAK instructions from
2982 last BEGINLOOP */
2983 while (inst0 > loop_inst[loop_insn]) {
2984 inst0--;
2985 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2986 inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
2987 inst0->bits3.if_else.pop_count = 0;
2988 } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2989 inst0->bits3.if_else.jump_count = inst1 - inst0;
2990 inst0->bits3.if_else.pop_count = 0;
2991 }
2992 }
2993 break;
2994 default:
2995 _mesa_printf("unsupported IR in fragment shader %d\n",
2996 inst->Opcode);
2997 }
2998
2999 if (inst->CondUpdate)
3000 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
3001 else
3002 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3003 }
3004 post_wm_emit(c);
3005 }
3006
3007
3008 /**
3009 * Do GPU code generation for shaders that use GLSL features such as
3010 * flow control. Other shaders will be compiled with the
3011 */
3012 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
3013 {
3014 if (INTEL_DEBUG & DEBUG_WM) {
3015 _mesa_printf("brw_wm_glsl_emit:\n");
3016 }
3017
3018 /* initial instruction translation/simplification */
3019 brw_wm_pass_fp(c);
3020
3021 /* actual code generation */
3022 brw_wm_emit_glsl(brw, c);
3023
3024 if (INTEL_DEBUG & DEBUG_WM) {
3025 brw_wm_print_program(c, "brw_wm_glsl_emit done");
3026 }
3027
3028 c->prog_data.total_grf = num_grf_used(c);
3029 c->prog_data.total_scratch = 0;
3030 }