fdd31d4ed50f18fd4ad503315086608c42205432
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "shader/prog_print.h"
4 #include "shader/prog_optimize.h"
5 #include "brw_context.h"
6 #include "brw_eu.h"
7 #include "brw_wm.h"
8
/**
 * Identifiers for reusable instruction sequences.  invoke_subroutine()
 * uses these values to index c->subroutines[].
 */
enum _subroutine {
    SUB_NOISE1,
    SUB_NOISE2,
    SUB_NOISE3,
    SUB_NOISE4
};
12
13 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
14 const struct prog_instruction *inst,
15 GLuint component);
16
17 /**
18 * Determine if the given fragment program uses GLSL features such
19 * as flow conditionals, loops, subroutines.
20 * Some GLSL shaders may use these features, others might not.
21 */
22 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
23 {
24 int i;
25
26 for (i = 0; i < fp->Base.NumInstructions; i++) {
27 const struct prog_instruction *inst = &fp->Base.Instructions[i];
28 switch (inst->Opcode) {
29 case OPCODE_ARL:
30 case OPCODE_IF:
31 case OPCODE_ENDIF:
32 case OPCODE_CAL:
33 case OPCODE_BRK:
34 case OPCODE_RET:
35 case OPCODE_NOISE1:
36 case OPCODE_NOISE2:
37 case OPCODE_NOISE3:
38 case OPCODE_NOISE4:
39 case OPCODE_BGNLOOP:
40 return GL_TRUE;
41 default:
42 break;
43 }
44 }
45 return GL_FALSE;
46 }
47
48
49
/* Forward declaration: alloc_grf() calls this before its definition. */
static void reclaim_temps(struct brw_wm_compile *c);
52
53
54 /** Mark GRF register as used. */
55 static void
56 prealloc_grf(struct brw_wm_compile *c, int r)
57 {
58 c->used_grf[r] = GL_TRUE;
59 }
60
61
62 /** Mark given GRF register as not in use. */
63 static void
64 release_grf(struct brw_wm_compile *c, int r)
65 {
66 /*assert(c->used_grf[r]);*/
67 c->used_grf[r] = GL_FALSE;
68 c->first_free_grf = MIN2(c->first_free_grf, r);
69 }
70
71
72 /** Return index of a free GRF, mark it as used. */
73 static int
74 alloc_grf(struct brw_wm_compile *c)
75 {
76 GLuint r;
77 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
78 if (!c->used_grf[r]) {
79 c->used_grf[r] = GL_TRUE;
80 c->first_free_grf = r + 1; /* a guess */
81 return r;
82 }
83 }
84
85 /* no free temps, try to reclaim some */
86 reclaim_temps(c);
87 c->first_free_grf = 0;
88
89 /* try alloc again */
90 for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
91 if (!c->used_grf[r]) {
92 c->used_grf[r] = GL_TRUE;
93 c->first_free_grf = r + 1; /* a guess */
94 return r;
95 }
96 }
97
98 for (r = 0; r < BRW_WM_MAX_GRF; r++) {
99 assert(c->used_grf[r]);
100 }
101
102 /* really, no free GRF regs found */
103 if (!c->out_of_regs) {
104 /* print warning once per compilation */
105 _mesa_warning(NULL, "i965: ran out of registers for fragment program");
106 c->out_of_regs = GL_TRUE;
107 }
108
109 return -1;
110 }
111
112
113 /** Return number of GRF registers used */
114 static int
115 num_grf_used(const struct brw_wm_compile *c)
116 {
117 int r;
118 for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
119 if (c->used_grf[r])
120 return r + 1;
121 return 0;
122 }
123
124
125
126 /**
127 * Record the mapping of a Mesa register to a hardware register.
128 */
129 static void set_reg(struct brw_wm_compile *c, int file, int index,
130 int component, struct brw_reg reg)
131 {
132 c->wm_regs[file][index][component].reg = reg;
133 c->wm_regs[file][index][component].inited = GL_TRUE;
134 }
135
136 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
137 {
138 struct brw_reg reg;
139
140 /* if we need to allocate another temp, grow the tmp_regs[] array */
141 if (c->tmp_index == c->tmp_max) {
142 int r = alloc_grf(c);
143 if (r < 0) {
144 /*printf("Out of temps in %s\n", __FUNCTION__);*/
145 r = 50; /* XXX random register! */
146 }
147 c->tmp_regs[ c->tmp_max++ ] = r;
148 }
149
150 /* form the GRF register */
151 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
152 /*printf("alloc_temp %d\n", reg.nr);*/
153 assert(reg.nr < BRW_WM_MAX_GRF);
154 return reg;
155
156 }
157
158 /**
159 * Save current temp register info.
160 * There must be a matching call to release_tmps().
161 */
162 static int mark_tmps(struct brw_wm_compile *c)
163 {
164 return c->tmp_index;
165 }
166
167 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
168 {
169 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
170 }
171
172 static void release_tmps(struct brw_wm_compile *c, int mark)
173 {
174 c->tmp_index = mark;
175 }
176
177 /**
178 * Convert Mesa src register to brw register.
179 *
180 * Since we're running in SOA mode each Mesa register corresponds to four
181 * hardware registers. We allocate the hardware registers as needed here.
182 *
183 * \param file register file, one of PROGRAM_x
184 * \param index register number
185 * \param component src component (X=0, Y=1, Z=2, W=3)
186 * \param nr not used?!?
187 * \param neg negate value?
188 * \param abs take absolute value?
189 */
190 static struct brw_reg
191 get_reg(struct brw_wm_compile *c, int file, int index, int component,
192 int nr, GLuint neg, GLuint abs)
193 {
194 struct brw_reg reg;
195 switch (file) {
196 case PROGRAM_STATE_VAR:
197 case PROGRAM_CONSTANT:
198 case PROGRAM_UNIFORM:
199 file = PROGRAM_STATE_VAR;
200 break;
201 case PROGRAM_UNDEFINED:
202 return brw_null_reg();
203 case PROGRAM_TEMPORARY:
204 case PROGRAM_INPUT:
205 case PROGRAM_OUTPUT:
206 case PROGRAM_PAYLOAD:
207 break;
208 default:
209 _mesa_problem(NULL, "Unexpected file in get_reg()");
210 return brw_null_reg();
211 }
212
213 assert(index < 256);
214 assert(component < 4);
215
216 /* see if we've already allocated a HW register for this Mesa register */
217 if (c->wm_regs[file][index][component].inited) {
218 /* yes, re-use */
219 reg = c->wm_regs[file][index][component].reg;
220 }
221 else {
222 /* no, allocate new register */
223 int grf = alloc_grf(c);
224 /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
225 if (grf < 0) {
226 /* totally out of temps */
227 grf = 51; /* XXX random register! */
228 }
229
230 reg = brw_vec8_grf(grf, 0);
231 /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
232
233 set_reg(c, file, index, component, reg);
234 }
235
236 if (neg & (1 << component)) {
237 reg = negate(reg);
238 }
239 if (abs)
240 reg = brw_abs(reg);
241 return reg;
242 }
243
244
245
246 /**
247 * This is called if we run out of GRF registers. Examine the live intervals
248 * of temp regs in the program and free those which won't be used again.
249 */
250 static void
251 reclaim_temps(struct brw_wm_compile *c)
252 {
253 GLint intBegin[MAX_PROGRAM_TEMPS];
254 GLint intEnd[MAX_PROGRAM_TEMPS];
255 int index;
256
257 /*printf("Reclaim temps:\n");*/
258
259 _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
260 intBegin, intEnd);
261
262 for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
263 if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
264 /* program temp[i] can be freed */
265 int component;
266 /*printf(" temp[%d] is dead\n", index);*/
267 for (component = 0; component < 4; component++) {
268 if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
269 int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
270 release_grf(c, r);
271 /*
272 printf(" Reclaim temp %d, reg %d at inst %d\n",
273 index, r, c->cur_inst);
274 */
275 c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
276 }
277 }
278 }
279 }
280 }
281
282
283
284
285 /**
286 * Preallocate registers. This sets up the Mesa to hardware register
287 * mapping for certain registers, such as constants (uniforms/state vars)
288 * and shader inputs.
289 */
290 static void prealloc_reg(struct brw_wm_compile *c)
291 {
292 int i, j;
293 struct brw_reg reg;
294 int urb_read_length = 0;
295 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
296 GLuint reg_index = 0;
297
298 memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
299 c->first_free_grf = 0;
300
301 for (i = 0; i < 4; i++) {
302 if (i < c->key.nr_depth_regs)
303 reg = brw_vec8_grf(i * 2, 0);
304 else
305 reg = brw_vec8_grf(0, 0);
306 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
307 }
308 reg_index += 2 * c->key.nr_depth_regs;
309
310 /* constants */
311 {
312 const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
313 const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
314
315 /* use a real constant buffer, or just use a section of the GRF? */
316 /* XXX this heuristic may need adjustment... */
317 if ((nr_params + nr_temps) * 4 + reg_index > 80)
318 c->fp->use_const_buffer = GL_TRUE;
319 else
320 c->fp->use_const_buffer = GL_FALSE;
321 /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
322
323 if (c->fp->use_const_buffer) {
324 /* We'll use a real constant buffer and fetch constants from
325 * it with a dataport read message.
326 */
327
328 /* number of float constants in CURBE */
329 c->prog_data.nr_params = 0;
330 }
331 else {
332 const struct gl_program_parameter_list *plist =
333 c->fp->program.Base.Parameters;
334 int index = 0;
335
336 /* number of float constants in CURBE */
337 c->prog_data.nr_params = 4 * nr_params;
338
339 /* loop over program constants (float[4]) */
340 for (i = 0; i < nr_params; i++) {
341 /* loop over XYZW channels */
342 for (j = 0; j < 4; j++, index++) {
343 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
344 /* Save pointer to parameter/constant value.
345 * Constants will be copied in prepare_constant_buffer()
346 */
347 c->prog_data.param[index] = &plist->ParameterValues[i][j];
348 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
349 }
350 }
351 /* number of constant regs used (each reg is float[8]) */
352 c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
353 reg_index += c->nr_creg;
354 }
355 }
356
357 /* fragment shader inputs */
358 for (i = 0; i < VERT_RESULT_MAX; i++) {
359 int fp_input;
360
361 if (i >= VERT_RESULT_VAR0)
362 fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
363 else if (i <= VERT_RESULT_TEX7)
364 fp_input = i;
365 else
366 fp_input = -1;
367
368 if (fp_input >= 0 && inputs & (1 << fp_input)) {
369 urb_read_length = reg_index;
370 reg = brw_vec8_grf(reg_index, 0);
371 for (j = 0; j < 4; j++)
372 set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
373 }
374 if (c->key.vp_outputs_written & (1 << i)) {
375 reg_index += 2;
376 }
377 }
378
379 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
380 c->prog_data.urb_read_length = urb_read_length;
381 c->prog_data.curb_read_length = c->nr_creg;
382 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
383 reg_index++;
384 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
385 reg_index += 2;
386
387 /* mark GRF regs [0..reg_index-1] as in-use */
388 for (i = 0; i < reg_index; i++)
389 prealloc_grf(c, i);
390
391 /* Don't use GRF 126, 127. Using them seems to lead to GPU lock-ups */
392 prealloc_grf(c, 126);
393 prealloc_grf(c, 127);
394
395 for (i = 0; i < c->nr_fp_insns; i++) {
396 const struct prog_instruction *inst = &c->prog_instructions[i];
397 struct brw_reg dst[4];
398
399 switch (inst->Opcode) {
400 case OPCODE_TEX:
401 case OPCODE_TXB:
402 /* Allocate the channels of texture results contiguously,
403 * since they are written out that way by the sampler unit.
404 */
405 for (j = 0; j < 4; j++) {
406 dst[j] = get_dst_reg(c, inst, j);
407 if (j != 0)
408 assert(dst[j].nr == dst[j - 1].nr + 1);
409 }
410 break;
411 default:
412 break;
413 }
414 }
415
416 /* An instruction may reference up to three constants.
417 * They'll be found in these registers.
418 * XXX alloc these on demand!
419 */
420 if (c->fp->use_const_buffer) {
421 for (i = 0; i < 3; i++) {
422 c->current_const[i].index = -1;
423 c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
424 }
425 }
426 #if 0
427 printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
428 printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
429 #endif
430 }
431
432
433 /**
434 * Check if any of the instruction's src registers are constants, uniforms,
435 * or statevars. If so, fetch any constants that we don't already have in
436 * the three GRF slots.
437 */
438 static void fetch_constants(struct brw_wm_compile *c,
439 const struct prog_instruction *inst)
440 {
441 struct brw_compile *p = &c->func;
442 GLuint i;
443
444 /* loop over instruction src regs */
445 for (i = 0; i < 3; i++) {
446 const struct prog_src_register *src = &inst->SrcReg[i];
447 if (src->File == PROGRAM_STATE_VAR ||
448 src->File == PROGRAM_CONSTANT ||
449 src->File == PROGRAM_UNIFORM) {
450 c->current_const[i].index = src->Index;
451
452 #if 0
453 printf(" fetch const[%d] for arg %d into reg %d\n",
454 src->Index, i, c->current_const[i].reg.nr);
455 #endif
456
457 /* need to fetch the constant now */
458 brw_dp_READ_4(p,
459 c->current_const[i].reg, /* writeback dest */
460 src->RelAddr, /* relative indexing? */
461 16 * src->Index, /* byte offset */
462 SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
463 );
464 }
465 }
466 }
467
468
469 /**
470 * Convert Mesa dst register to brw register.
471 */
472 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
473 const struct prog_instruction *inst,
474 GLuint component)
475 {
476 const int nr = 1;
477 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
478 0, 0);
479 }
480
481
482 static struct brw_reg
483 get_src_reg_const(struct brw_wm_compile *c,
484 const struct prog_instruction *inst,
485 GLuint srcRegIndex, GLuint component)
486 {
487 /* We should have already fetched the constant from the constant
488 * buffer in fetch_constants(). Now we just have to return a
489 * register description that extracts the needed component and
490 * smears it across all eight vector components.
491 */
492 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
493 struct brw_reg const_reg;
494
495 assert(component < 4);
496 assert(srcRegIndex < 3);
497 assert(c->current_const[srcRegIndex].index != -1);
498 const_reg = c->current_const[srcRegIndex].reg;
499
500 /* extract desired float from the const_reg, and smear */
501 const_reg = stride(const_reg, 0, 1, 0);
502 const_reg.subnr = component * 4;
503
504 if (src->Negate & (1 << component))
505 const_reg = negate(const_reg);
506 if (src->Abs)
507 const_reg = brw_abs(const_reg);
508
509 #if 0
510 printf(" form const[%d].%d for arg %d, reg %d\n",
511 c->current_const[srcRegIndex].index,
512 component,
513 srcRegIndex,
514 const_reg.nr);
515 #endif
516
517 return const_reg;
518 }
519
520
521 /**
522 * Convert Mesa src register to brw register.
523 */
524 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
525 const struct prog_instruction *inst,
526 GLuint srcRegIndex, GLuint channel)
527 {
528 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
529 const GLuint nr = 1;
530 const GLuint component = GET_SWZ(src->Swizzle, channel);
531
532 /* Extended swizzle terms */
533 if (component == SWIZZLE_ZERO) {
534 return brw_imm_f(0.0F);
535 }
536 else if (component == SWIZZLE_ONE) {
537 return brw_imm_f(1.0F);
538 }
539
540 if (c->fp->use_const_buffer &&
541 (src->File == PROGRAM_STATE_VAR ||
542 src->File == PROGRAM_CONSTANT ||
543 src->File == PROGRAM_UNIFORM)) {
544 return get_src_reg_const(c, inst, srcRegIndex, component);
545 }
546 else {
547 /* other type of source register */
548 return get_reg(c, src->File, src->Index, component, nr,
549 src->Negate, src->Abs);
550 }
551 }
552
553
554 /**
555 * Same as \sa get_src_reg() but if the register is a literal, emit
556 * a brw_reg encoding the literal.
557 * Note that a brw instruction only allows one src operand to be a literal.
558 * For instructions with more than one operand, only the second can be a
559 * literal. This means that we treat some literals as constants/uniforms
560 * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
561 *
562 */
563 static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
564 const struct prog_instruction *inst,
565 GLuint srcRegIndex, GLuint channel)
566 {
567 const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
568 if (src->File == PROGRAM_CONSTANT) {
569 /* a literal */
570 const int component = GET_SWZ(src->Swizzle, channel);
571 const GLfloat *param =
572 c->fp->program.Base.Parameters->ParameterValues[src->Index];
573 GLfloat value = param[component];
574 if (src->Negate & (1 << channel))
575 value = -value;
576 if (src->Abs)
577 value = FABSF(value);
578 #if 0
579 printf(" form immed value %f for chan %d\n", value, channel);
580 #endif
581 return brw_imm_f(value);
582 }
583 else {
584 return get_src_reg(c, inst, srcRegIndex, channel);
585 }
586 }
587
588
589 /**
590 * Subroutines are minimal support for resusable instruction sequences.
591 * They are implemented as simply as possible to minimise overhead: there
592 * is no explicit support for communication between the caller and callee
593 * other than saving the return address in a temporary register, nor is
594 * there any automatic local storage. This implies that great care is
595 * required before attempting reentrancy or any kind of nested
596 * subroutine invocations.
597 */
598 static void invoke_subroutine( struct brw_wm_compile *c,
599 enum _subroutine subroutine,
600 void (*emit)( struct brw_wm_compile * ) )
601 {
602 struct brw_compile *p = &c->func;
603
604 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
605
606 if( c->subroutines[ subroutine ] ) {
607 /* subroutine previously emitted: reuse existing instructions */
608
609 int mark = mark_tmps( c );
610 struct brw_reg return_address = retype( alloc_tmp( c ),
611 BRW_REGISTER_TYPE_UD );
612 int here = p->nr_insn;
613
614 brw_push_insn_state(p);
615 brw_set_mask_control(p, BRW_MASK_DISABLE);
616 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
617
618 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
619 brw_imm_d( ( c->subroutines[ subroutine ] -
620 here - 1 ) << 4 ) );
621 brw_pop_insn_state(p);
622
623 release_tmps( c, mark );
624 } else {
625 /* previously unused subroutine: emit, and mark for later reuse */
626
627 int mark = mark_tmps( c );
628 struct brw_reg return_address = retype( alloc_tmp( c ),
629 BRW_REGISTER_TYPE_UD );
630 struct brw_instruction *calc;
631 int base = p->nr_insn;
632
633 brw_push_insn_state(p);
634 brw_set_mask_control(p, BRW_MASK_DISABLE);
635 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
636 brw_pop_insn_state(p);
637
638 c->subroutines[ subroutine ] = p->nr_insn;
639
640 emit( c );
641
642 brw_push_insn_state(p);
643 brw_set_mask_control(p, BRW_MASK_DISABLE);
644 brw_MOV( p, brw_ip_reg(), return_address );
645 brw_pop_insn_state(p);
646
647 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
648
649 release_tmps( c, mark );
650 }
651 }
652
653 static void emit_trunc( struct brw_wm_compile *c,
654 const struct prog_instruction *inst)
655 {
656 int i;
657 struct brw_compile *p = &c->func;
658 GLuint mask = inst->DstReg.WriteMask;
659 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
660 for (i = 0; i < 4; i++) {
661 if (mask & (1<<i)) {
662 struct brw_reg src, dst;
663 dst = get_dst_reg(c, inst, i);
664 src = get_src_reg(c, inst, 0, i);
665 brw_RNDZ(p, dst, src);
666 }
667 }
668 brw_set_saturate(p, 0);
669 }
670
671 static void emit_pixel_xy(struct brw_wm_compile *c,
672 const struct prog_instruction *inst)
673 {
674 struct brw_reg r1 = brw_vec1_grf(1, 0);
675 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
676
677 struct brw_reg dst0, dst1;
678 struct brw_compile *p = &c->func;
679 GLuint mask = inst->DstReg.WriteMask;
680
681 dst0 = get_dst_reg(c, inst, 0);
682 dst1 = get_dst_reg(c, inst, 1);
683 /* Calculate pixel centers by adding 1 or 0 to each of the
684 * micro-tile coordinates passed in r1.
685 */
686 if (mask & WRITEMASK_X) {
687 brw_ADD(p,
688 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
689 stride(suboffset(r1_uw, 4), 2, 4, 0),
690 brw_imm_v(0x10101010));
691 }
692
693 if (mask & WRITEMASK_Y) {
694 brw_ADD(p,
695 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
696 stride(suboffset(r1_uw, 5), 2, 4, 0),
697 brw_imm_v(0x11001100));
698 }
699 }
700
701 static void emit_delta_xy(struct brw_wm_compile *c,
702 const struct prog_instruction *inst)
703 {
704 struct brw_reg r1 = brw_vec1_grf(1, 0);
705 struct brw_reg dst0, dst1, src0, src1;
706 struct brw_compile *p = &c->func;
707 GLuint mask = inst->DstReg.WriteMask;
708
709 dst0 = get_dst_reg(c, inst, 0);
710 dst1 = get_dst_reg(c, inst, 1);
711 src0 = get_src_reg(c, inst, 0, 0);
712 src1 = get_src_reg(c, inst, 0, 1);
713 /* Calc delta X,Y by subtracting origin in r1 from the pixel
714 * centers.
715 */
716 if (mask & WRITEMASK_X) {
717 brw_ADD(p,
718 dst0,
719 retype(src0, BRW_REGISTER_TYPE_UW),
720 negate(r1));
721 }
722
723 if (mask & WRITEMASK_Y) {
724 brw_ADD(p,
725 dst1,
726 retype(src1, BRW_REGISTER_TYPE_UW),
727 negate(suboffset(r1,1)));
728
729 }
730 }
731
732 static void fire_fb_write( struct brw_wm_compile *c,
733 GLuint base_reg,
734 GLuint nr,
735 GLuint target,
736 GLuint eot)
737 {
738 struct brw_compile *p = &c->func;
739 /* Pass through control information:
740 */
741 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
742 {
743 brw_push_insn_state(p);
744 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
745 brw_MOV(p,
746 brw_message_reg(base_reg + 1),
747 brw_vec8_grf(1, 0));
748 brw_pop_insn_state(p);
749 }
750 /* Send framebuffer write message: */
751 brw_fb_WRITE(p,
752 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
753 base_reg,
754 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
755 target,
756 nr,
757 0,
758 eot);
759 }
760
761 static void emit_fb_write(struct brw_wm_compile *c,
762 const struct prog_instruction *inst)
763 {
764 struct brw_compile *p = &c->func;
765 int nr = 2;
766 int channel;
767 GLuint target, eot;
768 struct brw_reg src0;
769
770 /* Reserve a space for AA - may not be needed:
771 */
772 if (c->key.aa_dest_stencil_reg)
773 nr += 1;
774
775 brw_push_insn_state(p);
776 for (channel = 0; channel < 4; channel++) {
777 src0 = get_src_reg(c, inst, 0, channel);
778 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
779 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
780 brw_MOV(p, brw_message_reg(nr + channel), src0);
781 }
782 /* skip over the regs populated above: */
783 nr += 8;
784 brw_pop_insn_state(p);
785
786 if (c->key.source_depth_to_render_target) {
787 if (c->key.computes_depth) {
788 src0 = get_src_reg(c, inst, 2, 2);
789 brw_MOV(p, brw_message_reg(nr), src0);
790 }
791 else {
792 src0 = get_src_reg(c, inst, 1, 1);
793 brw_MOV(p, brw_message_reg(nr), src0);
794 }
795
796 nr += 2;
797 }
798
799 if (c->key.dest_depth_reg) {
800 const GLuint comp = c->key.dest_depth_reg / 2;
801 const GLuint off = c->key.dest_depth_reg % 2;
802
803 if (off != 0) {
804 /* XXX this code needs review/testing */
805 struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
806 struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
807
808 brw_push_insn_state(p);
809 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
810
811 brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
812 /* 2nd half? */
813 brw_MOV(p, brw_message_reg(nr+1), arg1_1);
814 brw_pop_insn_state(p);
815 }
816 else
817 {
818 struct brw_reg src = get_src_reg(c, inst, 1, 1);
819 brw_MOV(p, brw_message_reg(nr), src);
820 }
821 nr += 2;
822 }
823
824 target = INST_AUX_GET_TARGET(inst->Aux);
825 eot = inst->Aux & INST_AUX_EOT;
826 fire_fb_write(c, 0, nr, target, eot);
827 }
828
829 static void emit_pixel_w( struct brw_wm_compile *c,
830 const struct prog_instruction *inst)
831 {
832 struct brw_compile *p = &c->func;
833 GLuint mask = inst->DstReg.WriteMask;
834 if (mask & WRITEMASK_W) {
835 struct brw_reg dst, src0, delta0, delta1;
836 struct brw_reg interp3;
837
838 dst = get_dst_reg(c, inst, 3);
839 src0 = get_src_reg(c, inst, 0, 0);
840 delta0 = get_src_reg(c, inst, 1, 0);
841 delta1 = get_src_reg(c, inst, 1, 1);
842
843 interp3 = brw_vec1_grf(src0.nr+1, 4);
844 /* Calc 1/w - just linterp wpos[3] optimized by putting the
845 * result straight into a message reg.
846 */
847 brw_LINE(p, brw_null_reg(), interp3, delta0);
848 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
849
850 /* Calc w */
851 brw_math_16( p, dst,
852 BRW_MATH_FUNCTION_INV,
853 BRW_MATH_SATURATE_NONE,
854 2, brw_null_reg(),
855 BRW_MATH_PRECISION_FULL);
856 }
857 }
858
859 static void emit_linterp(struct brw_wm_compile *c,
860 const struct prog_instruction *inst)
861 {
862 struct brw_compile *p = &c->func;
863 GLuint mask = inst->DstReg.WriteMask;
864 struct brw_reg interp[4];
865 struct brw_reg dst, delta0, delta1;
866 struct brw_reg src0;
867 GLuint nr, i;
868
869 src0 = get_src_reg(c, inst, 0, 0);
870 delta0 = get_src_reg(c, inst, 1, 0);
871 delta1 = get_src_reg(c, inst, 1, 1);
872 nr = src0.nr;
873
874 interp[0] = brw_vec1_grf(nr, 0);
875 interp[1] = brw_vec1_grf(nr, 4);
876 interp[2] = brw_vec1_grf(nr+1, 0);
877 interp[3] = brw_vec1_grf(nr+1, 4);
878
879 for(i = 0; i < 4; i++ ) {
880 if (mask & (1<<i)) {
881 dst = get_dst_reg(c, inst, i);
882 brw_LINE(p, brw_null_reg(), interp[i], delta0);
883 brw_MAC(p, dst, suboffset(interp[i],1), delta1);
884 }
885 }
886 }
887
888 static void emit_cinterp(struct brw_wm_compile *c,
889 const struct prog_instruction *inst)
890 {
891 struct brw_compile *p = &c->func;
892 GLuint mask = inst->DstReg.WriteMask;
893
894 struct brw_reg interp[4];
895 struct brw_reg dst, src0;
896 GLuint nr, i;
897
898 src0 = get_src_reg(c, inst, 0, 0);
899 nr = src0.nr;
900
901 interp[0] = brw_vec1_grf(nr, 0);
902 interp[1] = brw_vec1_grf(nr, 4);
903 interp[2] = brw_vec1_grf(nr+1, 0);
904 interp[3] = brw_vec1_grf(nr+1, 4);
905
906 for(i = 0; i < 4; i++ ) {
907 if (mask & (1<<i)) {
908 dst = get_dst_reg(c, inst, i);
909 brw_MOV(p, dst, suboffset(interp[i],3));
910 }
911 }
912 }
913
914 static void emit_pinterp(struct brw_wm_compile *c,
915 const struct prog_instruction *inst)
916 {
917 struct brw_compile *p = &c->func;
918 GLuint mask = inst->DstReg.WriteMask;
919
920 struct brw_reg interp[4];
921 struct brw_reg dst, delta0, delta1;
922 struct brw_reg src0, w;
923 GLuint nr, i;
924
925 src0 = get_src_reg(c, inst, 0, 0);
926 delta0 = get_src_reg(c, inst, 1, 0);
927 delta1 = get_src_reg(c, inst, 1, 1);
928 w = get_src_reg(c, inst, 2, 3);
929 nr = src0.nr;
930
931 interp[0] = brw_vec1_grf(nr, 0);
932 interp[1] = brw_vec1_grf(nr, 4);
933 interp[2] = brw_vec1_grf(nr+1, 0);
934 interp[3] = brw_vec1_grf(nr+1, 4);
935
936 for(i = 0; i < 4; i++ ) {
937 if (mask & (1<<i)) {
938 dst = get_dst_reg(c, inst, i);
939 brw_LINE(p, brw_null_reg(), interp[i], delta0);
940 brw_MAC(p, dst, suboffset(interp[i],1),
941 delta1);
942 brw_MUL(p, dst, dst, w);
943 }
944 }
945 }
946
947 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
948 static void emit_frontfacing(struct brw_wm_compile *c,
949 const struct prog_instruction *inst)
950 {
951 struct brw_compile *p = &c->func;
952 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
953 struct brw_reg dst;
954 GLuint mask = inst->DstReg.WriteMask;
955 int i;
956
957 for (i = 0; i < 4; i++) {
958 if (mask & (1<<i)) {
959 dst = get_dst_reg(c, inst, i);
960 brw_MOV(p, dst, brw_imm_f(0.0));
961 }
962 }
963
964 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
965 * us front face
966 */
967 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
968 for (i = 0; i < 4; i++) {
969 if (mask & (1<<i)) {
970 dst = get_dst_reg(c, inst, i);
971 brw_MOV(p, dst, brw_imm_f(1.0));
972 }
973 }
974 brw_set_predicate_control_flag_value(p, 0xff);
975 }
976
977 static void emit_xpd(struct brw_wm_compile *c,
978 const struct prog_instruction *inst)
979 {
980 int i;
981 struct brw_compile *p = &c->func;
982 GLuint mask = inst->DstReg.WriteMask;
983 for (i = 0; i < 4; i++) {
984 GLuint i2 = (i+2)%3;
985 GLuint i1 = (i+1)%3;
986 if (mask & (1<<i)) {
987 struct brw_reg src0, src1, dst;
988 dst = get_dst_reg(c, inst, i);
989 src0 = negate(get_src_reg(c, inst, 0, i2));
990 src1 = get_src_reg_imm(c, inst, 1, i1);
991 brw_MUL(p, brw_null_reg(), src0, src1);
992 src0 = get_src_reg(c, inst, 0, i1);
993 src1 = get_src_reg_imm(c, inst, 1, i2);
994 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
995 brw_MAC(p, dst, src0, src1);
996 brw_set_saturate(p, 0);
997 }
998 }
999 brw_set_saturate(p, 0);
1000 }
1001
1002 static void emit_dp3(struct brw_wm_compile *c,
1003 const struct prog_instruction *inst)
1004 {
1005 struct brw_reg src0[3], src1[3], dst;
1006 int i;
1007 struct brw_compile *p = &c->func;
1008 GLuint mask = inst->DstReg.WriteMask;
1009 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1010
1011 if (!(mask & WRITEMASK_XYZW))
1012 return;
1013
1014 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1015
1016 for (i = 0; i < 3; i++) {
1017 src0[i] = get_src_reg(c, inst, 0, i);
1018 src1[i] = get_src_reg_imm(c, inst, 1, i);
1019 }
1020
1021 dst = get_dst_reg(c, inst, dst_chan);
1022 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1023 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1024 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1025 brw_MAC(p, dst, src0[2], src1[2]);
1026 brw_set_saturate(p, 0);
1027 }
1028
1029 static void emit_dp4(struct brw_wm_compile *c,
1030 const struct prog_instruction *inst)
1031 {
1032 struct brw_reg src0[4], src1[4], dst;
1033 int i;
1034 struct brw_compile *p = &c->func;
1035 GLuint mask = inst->DstReg.WriteMask;
1036 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1037
1038 if (!(mask & WRITEMASK_XYZW))
1039 return;
1040
1041 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1042
1043 for (i = 0; i < 4; i++) {
1044 src0[i] = get_src_reg(c, inst, 0, i);
1045 src1[i] = get_src_reg_imm(c, inst, 1, i);
1046 }
1047 dst = get_dst_reg(c, inst, dst_chan);
1048 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1049 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1050 brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
1051 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1052 brw_MAC(p, dst, src0[3], src1[3]);
1053 brw_set_saturate(p, 0);
1054 }
1055
1056 static void emit_dph(struct brw_wm_compile *c,
1057 const struct prog_instruction *inst)
1058 {
1059 struct brw_reg src0[4], src1[4], dst;
1060 int i;
1061 struct brw_compile *p = &c->func;
1062 GLuint mask = inst->DstReg.WriteMask;
1063 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1064
1065 if (!(mask & WRITEMASK_XYZW))
1066 return;
1067
1068 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1069
1070 for (i = 0; i < 4; i++) {
1071 src0[i] = get_src_reg(c, inst, 0, i);
1072 src1[i] = get_src_reg_imm(c, inst, 1, i);
1073 }
1074 dst = get_dst_reg(c, inst, dst_chan);
1075 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
1076 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
1077 brw_MAC(p, dst, src0[2], src1[2]);
1078 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1079 brw_ADD(p, dst, dst, src1[3]);
1080 brw_set_saturate(p, 0);
1081 }
1082
1083 /**
1084 * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
1085 * Note that the result of the function is smeared across the dest
1086 * register's X, Y, Z and W channels (subject to writemasking of course).
1087 */
1088 static void emit_math1(struct brw_wm_compile *c,
1089 const struct prog_instruction *inst, GLuint func)
1090 {
1091 struct brw_compile *p = &c->func;
1092 struct brw_reg src0, dst;
1093 GLuint mask = inst->DstReg.WriteMask;
1094 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1095
1096 if (!(mask & WRITEMASK_XYZW))
1097 return;
1098
1099 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1100
1101 /* Get first component of source register */
1102 dst = get_dst_reg(c, inst, dst_chan);
1103 src0 = get_src_reg(c, inst, 0, 0);
1104
1105 brw_MOV(p, brw_message_reg(2), src0);
1106 brw_math(p,
1107 dst,
1108 func,
1109 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1110 2,
1111 brw_null_reg(),
1112 BRW_MATH_DATA_VECTOR,
1113 BRW_MATH_PRECISION_FULL);
1114 }
1115
1116 static void emit_rcp(struct brw_wm_compile *c,
1117 const struct prog_instruction *inst)
1118 {
1119 emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
1120 }
1121
1122 static void emit_rsq(struct brw_wm_compile *c,
1123 const struct prog_instruction *inst)
1124 {
1125 emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
1126 }
1127
1128 static void emit_sin(struct brw_wm_compile *c,
1129 const struct prog_instruction *inst)
1130 {
1131 emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
1132 }
1133
1134 static void emit_cos(struct brw_wm_compile *c,
1135 const struct prog_instruction *inst)
1136 {
1137 emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
1138 }
1139
1140 static void emit_ex2(struct brw_wm_compile *c,
1141 const struct prog_instruction *inst)
1142 {
1143 emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
1144 }
1145
1146 static void emit_lg2(struct brw_wm_compile *c,
1147 const struct prog_instruction *inst)
1148 {
1149 emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
1150 }
1151
1152 static void emit_arl(struct brw_wm_compile *c,
1153 const struct prog_instruction *inst)
1154 {
1155 struct brw_compile *p = &c->func;
1156 struct brw_reg src0, addr_reg;
1157 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1158 addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
1159 BRW_ARF_ADDRESS, 0);
1160 src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
1161 brw_MOV(p, addr_reg, src0);
1162 brw_set_saturate(p, 0);
1163 }
1164
1165
1166 static void emit_min_max(struct brw_wm_compile *c,
1167 const struct prog_instruction *inst)
1168 {
1169 struct brw_compile *p = &c->func;
1170 const GLuint mask = inst->DstReg.WriteMask;
1171 const int mark = mark_tmps(c);
1172 int i;
1173 brw_push_insn_state(p);
1174 for (i = 0; i < 4; i++) {
1175 if (mask & (1<<i)) {
1176 struct brw_reg real_dst = get_dst_reg(c, inst, i);
1177 struct brw_reg src0 = get_src_reg(c, inst, 0, i);
1178 struct brw_reg src1 = get_src_reg(c, inst, 1, i);
1179 struct brw_reg dst;
1180 /* if dst==src0 or dst==src1 we need to use a temp reg */
1181 GLboolean use_temp = brw_same_reg(dst, src0) ||
1182 brw_same_reg(dst, src1);
1183 if (use_temp)
1184 dst = alloc_tmp(c);
1185 else
1186 dst = real_dst;
1187
1188 /*
1189 printf(" Min/max: dst %d src0 %d src1 %d\n",
1190 dst.nr, src0.nr, src1.nr);
1191 */
1192 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1193 brw_MOV(p, dst, src0);
1194 brw_set_saturate(p, 0);
1195
1196 if (inst->Opcode == OPCODE_MIN)
1197 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
1198 else
1199 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
1200
1201 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1202 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1203 brw_MOV(p, dst, src1);
1204 brw_set_saturate(p, 0);
1205 brw_set_predicate_control_flag_value(p, 0xff);
1206 if (use_temp)
1207 brw_MOV(p, real_dst, dst);
1208 }
1209 }
1210 brw_pop_insn_state(p);
1211 release_tmps(c, mark);
1212 }
1213
1214 static void emit_pow(struct brw_wm_compile *c,
1215 const struct prog_instruction *inst)
1216 {
1217 struct brw_compile *p = &c->func;
1218 struct brw_reg dst, src0, src1;
1219 GLuint mask = inst->DstReg.WriteMask;
1220 int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
1221
1222 if (!(mask & WRITEMASK_XYZW))
1223 return;
1224
1225 assert(is_power_of_two(mask & WRITEMASK_XYZW));
1226
1227 dst = get_dst_reg(c, inst, dst_chan);
1228 src0 = get_src_reg_imm(c, inst, 0, 0);
1229 src1 = get_src_reg_imm(c, inst, 1, 0);
1230
1231 brw_MOV(p, brw_message_reg(2), src0);
1232 brw_MOV(p, brw_message_reg(3), src1);
1233
1234 brw_math(p,
1235 dst,
1236 BRW_MATH_FUNCTION_POW,
1237 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1238 2,
1239 brw_null_reg(),
1240 BRW_MATH_DATA_VECTOR,
1241 BRW_MATH_PRECISION_FULL);
1242 }
1243
1244 static void emit_lrp(struct brw_wm_compile *c,
1245 const struct prog_instruction *inst)
1246 {
1247 struct brw_compile *p = &c->func;
1248 GLuint mask = inst->DstReg.WriteMask;
1249 struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
1250 int i;
1251 int mark = mark_tmps(c);
1252 for (i = 0; i < 4; i++) {
1253 if (mask & (1<<i)) {
1254 dst = get_dst_reg(c, inst, i);
1255 src0 = get_src_reg(c, inst, 0, i);
1256
1257 src1 = get_src_reg_imm(c, inst, 1, i);
1258
1259 if (src1.nr == dst.nr) {
1260 tmp1 = alloc_tmp(c);
1261 brw_MOV(p, tmp1, src1);
1262 } else
1263 tmp1 = src1;
1264
1265 src2 = get_src_reg(c, inst, 2, i);
1266 if (src2.nr == dst.nr) {
1267 tmp2 = alloc_tmp(c);
1268 brw_MOV(p, tmp2, src2);
1269 } else
1270 tmp2 = src2;
1271
1272 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1273 brw_MUL(p, brw_null_reg(), dst, tmp2);
1274 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1275 brw_MAC(p, dst, src0, tmp1);
1276 brw_set_saturate(p, 0);
1277 }
1278 release_tmps(c, mark);
1279 }
1280 }
1281
1282 /**
1283 * For GLSL shaders, this KIL will be unconditional.
1284 * It may be contained inside an IF/ENDIF structure of course.
1285 */
1286 static void emit_kil(struct brw_wm_compile *c)
1287 {
1288 struct brw_compile *p = &c->func;
1289 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1290 brw_push_insn_state(p);
1291 brw_set_mask_control(p, BRW_MASK_DISABLE);
1292 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1293 brw_AND(p, depth, c->emit_mask_reg, depth);
1294 brw_pop_insn_state(p);
1295 }
1296
1297 static void emit_mad(struct brw_wm_compile *c,
1298 const struct prog_instruction *inst)
1299 {
1300 struct brw_compile *p = &c->func;
1301 GLuint mask = inst->DstReg.WriteMask;
1302 struct brw_reg dst, src0, src1, src2;
1303 int i;
1304
1305 for (i = 0; i < 4; i++) {
1306 if (mask & (1<<i)) {
1307 dst = get_dst_reg(c, inst, i);
1308 src0 = get_src_reg(c, inst, 0, i);
1309 src1 = get_src_reg_imm(c, inst, 1, i);
1310 src2 = get_src_reg_imm(c, inst, 2, i);
1311 brw_MUL(p, dst, src0, src1);
1312
1313 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1314 brw_ADD(p, dst, dst, src2);
1315 brw_set_saturate(p, 0);
1316 }
1317 }
1318 }
1319
1320 static void emit_sop(struct brw_wm_compile *c,
1321 const struct prog_instruction *inst, GLuint cond)
1322 {
1323 struct brw_compile *p = &c->func;
1324 GLuint mask = inst->DstReg.WriteMask;
1325 struct brw_reg dst, src0, src1;
1326 int i;
1327
1328 for (i = 0; i < 4; i++) {
1329 if (mask & (1<<i)) {
1330 dst = get_dst_reg(c, inst, i);
1331 src0 = get_src_reg(c, inst, 0, i);
1332 src1 = get_src_reg_imm(c, inst, 1, i);
1333 brw_push_insn_state(p);
1334 brw_CMP(p, brw_null_reg(), cond, src0, src1);
1335 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1336 brw_MOV(p, dst, brw_imm_f(0.0));
1337 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1338 brw_MOV(p, dst, brw_imm_f(1.0));
1339 brw_pop_insn_state(p);
1340 }
1341 }
1342 }
1343
1344 static void emit_slt(struct brw_wm_compile *c,
1345 const struct prog_instruction *inst)
1346 {
1347 emit_sop(c, inst, BRW_CONDITIONAL_L);
1348 }
1349
1350 static void emit_sle(struct brw_wm_compile *c,
1351 const struct prog_instruction *inst)
1352 {
1353 emit_sop(c, inst, BRW_CONDITIONAL_LE);
1354 }
1355
1356 static void emit_sgt(struct brw_wm_compile *c,
1357 const struct prog_instruction *inst)
1358 {
1359 emit_sop(c, inst, BRW_CONDITIONAL_G);
1360 }
1361
1362 static void emit_sge(struct brw_wm_compile *c,
1363 const struct prog_instruction *inst)
1364 {
1365 emit_sop(c, inst, BRW_CONDITIONAL_GE);
1366 }
1367
1368 static void emit_seq(struct brw_wm_compile *c,
1369 const struct prog_instruction *inst)
1370 {
1371 emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1372 }
1373
1374 static void emit_sne(struct brw_wm_compile *c,
1375 const struct prog_instruction *inst)
1376 {
1377 emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1378 }
1379
1380 static INLINE struct brw_reg high_words( struct brw_reg reg )
1381 {
1382 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1383 0, 8, 2 );
1384 }
1385
1386 static INLINE struct brw_reg low_words( struct brw_reg reg )
1387 {
1388 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1389 }
1390
1391 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1392 {
1393 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1394 }
1395
1396 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1397 {
1398 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1399 0, 16, 2 );
1400 }
1401
1402 /* One-, two- and three-dimensional Perlin noise, similar to the description
1403 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
/*
 * noise1_sub: emit the body of the one-dimensional noise subroutine.
 *
 * The coordinate is read from, and the final result is written back to, the
 * temporary returned by lookup_tmp( c, mark - 2 ).
 * NOTE(review): that is presumably the temporary emit_noise1() allocates
 * before invoking this routine, with the "- 2" skipping a temporary taken by
 * invoke_subroutine() -- confirm against that helper.
 * NOTE(review): the access mode is switched to BRW_ALIGN_1 below and is not
 * restored inside this function.
 *
 * All temporaries allocated here are released again before returning.
 */
1404 static void noise1_sub( struct brw_wm_compile *c ) {
1405
1406 struct brw_compile *p = &c->func;
1407 struct brw_reg param,
1408 x0, x1, /* gradients at each end */
1409 t, tmp[ 2 ], /* float temporaries */
1410 itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1411 int i;
1412 int mark = mark_tmps( c );
1413
1414 x0 = alloc_tmp( c );
1415 x1 = alloc_tmp( c );
1416 t = alloc_tmp( c );
1417 tmp[ 0 ] = alloc_tmp( c );
1418 tmp[ 1 ] = alloc_tmp( c );
/* itmp[ 0..1 ] view tmp[ 0..1 ] as unsigned dwords and carry the hashes;
   itmp[ 2..4 ] view x0, x1 and t, and hold the three hash multipliers until
   those registers are overwritten with float data further down. */
1419 itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1420 itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1421 itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1422 itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1423 itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1424
1425 param = lookup_tmp( c, mark - 2 );
1426
1427 brw_set_access_mode( p, BRW_ALIGN_1 );
1428
1429 brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1430
1431 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1432 be hashed. Also compute the remainder (offset within the unit
1433 length), interleaved to reduce register dependency penalties. */
1434 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1435 brw_FRC( p, param, param );
1436 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1437 brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1438 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1439
1440 /* We're now ready to perform the hashing. The two hashes are
1441 interleaved for performance. The hash function used is
1442 designed to rapidly achieve avalanche and require only 32x16
1443 bit multiplication, and 16-bit swizzles (which we get for
1444 free). We can't use immediate operands in the multiplies,
1445 because immediates are permitted only in src1 and the 16-bit
1446 factor is permitted only in src0. */
/* Three rounds of: multiply by a constant, then XOR the high word of each
   hash into its low word. */
1447 for( i = 0; i < 2; i++ )
1448 brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1449 for( i = 0; i < 2; i++ )
1450 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1451 high_words( itmp[ i ] ) );
1452 for( i = 0; i < 2; i++ )
1453 brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1454 for( i = 0; i < 2; i++ )
1455 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1456 high_words( itmp[ i ] ) );
1457 for( i = 0; i < 2; i++ )
1458 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1459 for( i = 0; i < 2; i++ )
1460 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1461 high_words( itmp[ i ] ) );
1462
1463 /* Now we want to initialise the two gradients based on the
1464 hashes. Format conversion from signed integer to float leaves
1465 everything scaled too high by a factor of pow( 2, 31 ), but
1466 we correct for that right at the end. */
/* From here on x0, x1 and t are used as floats, so the multiplier constants
   aliased onto them are gone.  t becomes (offset - 1), the offset as seen
   from the far end. */
1467 brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1468 brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1469 brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1470
1471 brw_MUL( p, x0, x0, param );
1472 brw_MUL( p, x1, x1, t );
1473
1474 /* We interpolate between the gradients using the polynomial
1475 6t^5 - 15t^4 + 10t^3 (Perlin). */
/* Evaluated as ((6t - 15)t + 10) * t * t * t, leaving the weight in param;
   the result is then x0 + (x1 - x0) * weight. */
1476 brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1477 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1478 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1479 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1480 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1481 brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1482 pipeline */
1483 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1484 brw_MUL( p, param, tmp[ 0 ], param );
1485 brw_MUL( p, x1, x1, param );
1486 brw_ADD( p, x0, x0, x1 );
1487 /* scale by pow( 2, -30 ), to compensate for the format conversion
1488 above and an extra factor of 2 so that a single gradient covers
1489 the [-1,1] range */
1490 brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1491
1492 release_tmps( c, mark );
1493 }
1494
1495 static void emit_noise1( struct brw_wm_compile *c,
1496 const struct prog_instruction *inst )
1497 {
1498 struct brw_compile *p = &c->func;
1499 struct brw_reg src, param, dst;
1500 GLuint mask = inst->DstReg.WriteMask;
1501 int i;
1502 int mark = mark_tmps( c );
1503
1504 assert( mark == 0 );
1505
1506 src = get_src_reg( c, inst, 0, 0 );
1507
1508 param = alloc_tmp( c );
1509
1510 brw_MOV( p, param, src );
1511
1512 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1513
1514 /* Fill in the result: */
1515 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1516 for (i = 0 ; i < 4; i++) {
1517 if (mask & (1<<i)) {
1518 dst = get_dst_reg(c, inst, i);
1519 brw_MOV( p, dst, param );
1520 }
1521 }
1522 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1523 brw_set_saturate( p, 0 );
1524
1525 release_tmps( c, mark );
1526 }
1527
/*
 * noise2_sub: emit the body of the two-dimensional noise subroutine.
 *
 * The two coordinates are read from the temporaries returned by
 * lookup_tmp( c, mark - 3 ) and lookup_tmp( c, mark - 2 ); the final result
 * is written to the first of them (param0).
 * NOTE(review): these are presumably the two temporaries emit_noise2()
 * allocates before invoking this routine -- confirm against
 * invoke_subroutine().
 * NOTE(review): the access mode is switched to BRW_ALIGN_1 below and is not
 * restored inside this function.
 *
 * All temporaries allocated here are released again before returning.
 */
1528 static void noise2_sub( struct brw_wm_compile *c ) {
1529
1530 struct brw_compile *p = &c->func;
1531 struct brw_reg param0, param1,
1532 x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1533 t, tmp[ 4 ], /* float temporaries */
1534 itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1535 int i;
1536 int mark = mark_tmps( c );
1537
1538 x0y0 = alloc_tmp( c );
1539 x0y1 = alloc_tmp( c );
1540 x1y0 = alloc_tmp( c );
1541 x1y1 = alloc_tmp( c );
1542 t = alloc_tmp( c );
1543 for( i = 0; i < 4; i++ ) {
1544 tmp[ i ] = alloc_tmp( c );
1545 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1546 }
/* itmp[ 4..6 ] view three of the gradient registers as unsigned dwords and
   hold the hash multipliers until the gradients are initialised. */
1547 itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1548 itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1549 itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1550
1551 param0 = lookup_tmp( c, mark - 3 );
1552 param1 = lookup_tmp( c, mark - 2 );
1553
1554 brw_set_access_mode( p, BRW_ALIGN_1 );
1555
1556 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1557 be hashed. Also compute the remainders (offsets within the unit
1558 square), interleaved to reduce register dependency penalties. */
/* The floor of param1 is added into the high word of itmp0, whose low word
   holds the floor of param0.  The other three corners are then derived by
   adding 1 to the high word (0x10000), the low word (0x1) or both
   (0x10001). */
1559 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1560 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1561 brw_FRC( p, param0, param0 );
1562 brw_FRC( p, param1, param1 );
1563 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1564 brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1565 low_words( itmp[ 1 ] ) );
1566 brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1567 brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1568 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1569 brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1570 brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1571
1572 /* We're now ready to perform the hashing. The four hashes are
1573 interleaved for performance. The hash function used is
1574 designed to rapidly achieve avalanche and require only 32x16
1575 bit multiplication, and 16-bit swizzles (which we get for
1576 free). We can't use immediate operands in the multiplies,
1577 because immediates are permitted only in src1 and the 16-bit
1578 factor is permitted only in src0. */
/* Three rounds of: multiply by a constant, then XOR the high word of each
   hash into its low word. */
1579 for( i = 0; i < 4; i++ )
1580 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1581 for( i = 0; i < 4; i++ )
1582 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1583 high_words( itmp[ i ] ) );
1584 for( i = 0; i < 4; i++ )
1585 brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1586 for( i = 0; i < 4; i++ )
1587 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1588 high_words( itmp[ i ] ) );
1589 for( i = 0; i < 4; i++ )
1590 brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1591 for( i = 0; i < 4; i++ )
1592 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1593 high_words( itmp[ i ] ) );
1594
1595 /* Now we want to initialise the four gradients based on the
1596 hashes. Format conversion from signed integer to float leaves
1597 everything scaled too high by a factor of pow( 2, 15 ), but
1598 we correct for that right at the end. */
/* The low word of each hash supplies the gradient component multiplied by
   the param0 offset, the high word the one multiplied by the param1 offset.
   t is (param0 - 1) first and is re-used as (param1 - 1) afterwards. */
1599 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1600 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1601 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1602 brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1603 brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1604
1605 brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1606 brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1607 brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1608 brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1609
1610 brw_MUL( p, x1y0, x1y0, t );
1611 brw_MUL( p, x1y1, x1y1, t );
1612 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1613 brw_MUL( p, x0y0, x0y0, param0 );
1614 brw_MUL( p, x0y1, x0y1, param0 );
1615
1616 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1617 brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1618 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1619 brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1620
1621 brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1622 brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1623 brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1624 brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1625
1626 /* We interpolate between the gradients using the polynomial
1627 6t^5 - 15t^4 + 10t^3 (Perlin). */
/* Evaluated as ((6t - 15)t + 10) * t * t * t for both coordinates at once;
   the weights end up in param0 and param1. */
1628 brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1629 brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1630 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1631 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1632 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1633 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1634 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1635 pipeline */
1636 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1637 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1638 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1639 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1640 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1641 pipeline */
1642 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1643 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1644 brw_MUL( p, param0, tmp[ 0 ], param0 );
1645 brw_MUL( p, param1, tmp[ 1 ], param1 );
1646
1647 /* Here we interpolate in the y dimension... */
1648 brw_MUL( p, x0y1, x0y1, param1 );
1649 brw_MUL( p, x1y1, x1y1, param1 );
1650 brw_ADD( p, x0y0, x0y0, x0y1 );
1651 brw_ADD( p, x1y0, x1y0, x1y1 );
1652
1653 /* And now in x. There are horrible register dependencies here,
1654 but we have nothing else to do. */
1655 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1656 brw_MUL( p, x1y0, x1y0, param0 );
1657 brw_ADD( p, x0y0, x0y0, x1y0 );
1658
1659 /* scale by pow( 2, -15 ), as described above */
1660 brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1661
1662 release_tmps( c, mark );
1663 }
1664
1665 static void emit_noise2( struct brw_wm_compile *c,
1666 const struct prog_instruction *inst )
1667 {
1668 struct brw_compile *p = &c->func;
1669 struct brw_reg src0, src1, param0, param1, dst;
1670 GLuint mask = inst->DstReg.WriteMask;
1671 int i;
1672 int mark = mark_tmps( c );
1673
1674 assert( mark == 0 );
1675
1676 src0 = get_src_reg( c, inst, 0, 0 );
1677 src1 = get_src_reg( c, inst, 0, 1 );
1678
1679 param0 = alloc_tmp( c );
1680 param1 = alloc_tmp( c );
1681
1682 brw_MOV( p, param0, src0 );
1683 brw_MOV( p, param1, src1 );
1684
1685 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1686
1687 /* Fill in the result: */
1688 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1689 for (i = 0 ; i < 4; i++) {
1690 if (mask & (1<<i)) {
1691 dst = get_dst_reg(c, inst, i);
1692 brw_MOV( p, dst, param0 );
1693 }
1694 }
1695 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1696 brw_set_saturate( p, 0 );
1697
1698 release_tmps( c, mark );
1699 }
1700
1701 /**
1702 * The three-dimensional case is much like the one- and two- versions above,
1703 * but since the number of corners is rapidly growing we now pack 16 16-bit
1704 * hashes into each register to extract more parallelism from the EUs.
1705 */
/*
 * noise3_sub: emit the body of the three-dimensional noise subroutine.
 *
 * The three coordinates are read from the temporaries returned by
 * lookup_tmp( c, mark - 4 ), lookup_tmp( c, mark - 3 ) and
 * lookup_tmp( c, mark - 2 ); the final result is written to the first of
 * them (param0).
 * NOTE(review): these are presumably the three temporaries emit_noise3()
 * allocates before invoking this routine -- confirm against
 * invoke_subroutine().
 * NOTE(review): the access mode is switched to BRW_ALIGN_1 below and is not
 * restored inside this function.
 *
 * All temporaries allocated here are released again before returning.
 */
1706 static void noise3_sub( struct brw_wm_compile *c ) {
1707
1708 struct brw_compile *p = &c->func;
1709 struct brw_reg param0, param1, param2,
1710 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1711 xi, yi, zi, /* interpolation coefficients */
1712 t, tmp[ 8 ], /* float temporaries */
1713 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1714 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1715 int i;
1716 int mark = mark_tmps( c );
1717
1718 x0y0 = alloc_tmp( c );
1719 x0y1 = alloc_tmp( c );
1720 x1y0 = alloc_tmp( c );
1721 x1y1 = alloc_tmp( c );
1722 xi = alloc_tmp( c );
1723 yi = alloc_tmp( c );
1724 zi = alloc_tmp( c );
1725 t = alloc_tmp( c );
/* tmp[ i ], itmp[ i ] and wtmp[ i ] are three views of the same GRF: float,
   unsigned dword, and unsigned words via brw_uw16_grf(). */
1726 for( i = 0; i < 8; i++ ) {
1727 tmp[ i ] = alloc_tmp( c );
1728 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1729 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1730 }
1731
1732 param0 = lookup_tmp( c, mark - 4 );
1733 param1 = lookup_tmp( c, mark - 3 );
1734 param2 = lookup_tmp( c, mark - 2 );
1735
1736 brw_set_access_mode( p, BRW_ALIGN_1 );
1737
1738 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1739 be hashed. Also compute the remainders (offsets within the unit
1740 cube), interleaved to reduce register dependency penalties. */
1741 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1742 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1743 brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1744 brw_FRC( p, param0, param0 );
1745 brw_FRC( p, param1, param1 );
1746 brw_FRC( p, param2, param2 );
1747 /* Since we now have only 16 bits of precision in the hash, we must
1748 be more careful about thorough mixing to maintain entropy as we
1749 squash the input vector into a small scalar. */
/* The MUL/MAC chain leaves a weighted sum of the three floors (weights
   0xBC8F, 0xD0BD, 0x9B93) in the low words of itmp0 -- this assumes the
   usual accumulator behaviour of MUL-to-null followed by MAC.  The high
   words then get that sum plus the first weight, and wtmp[ 1..3 ] are
   derived below by adding the second weight, the third weight, or both. */
1750 brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1751 brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1752 brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1753 brw_imm_uw( 0x9B93 ) );
1754 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1755 brw_imm_uw( 0xBC8F ) );
1756
1757 /* Temporarily disable the execution mask while we work with ExecSize=16
1758 channels (the mask is set for ExecSize=8 and is probably incorrect).
1759 Although this might cause execution of unwanted channels, the code
1760 writes only to temporary registers and has no side effects, so
1761 disabling the mask is harmless. */
1762 brw_push_insn_state( p );
1763 brw_set_mask_control( p, BRW_MASK_DISABLE );
1764 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1765 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1766 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1767
1768 /* We're now ready to perform the hashing. The eight hashes are
1769 interleaved for performance. The hash function used is
1770 designed to rapidly achieve avalanche and require only 16x16
1771 bit multiplication, and 8-bit swizzles (which we get for
1772 free). */
/* Two rounds of: multiply by a constant, then XOR the odd byte of each hash
   into its even byte. */
1773 for( i = 0; i < 4; i++ )
1774 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1775 for( i = 0; i < 4; i++ )
1776 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1777 odd_bytes( wtmp[ i ] ) );
1778 for( i = 0; i < 4; i++ )
1779 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1780 for( i = 0; i < 4; i++ )
1781 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1782 odd_bytes( wtmp[ i ] ) );
1783 brw_pop_insn_state( p );
1784
1785 /* Now we want to initialise the four rear gradients based on the
1786 hashes. Format conversion from signed integer to float leaves
1787 everything scaled too high by a factor of pow( 2, 15 ), but
1788 we correct for that right at the end. */
/* The rear face uses the hashes in tmp[ 0 ] and tmp[ 1 ].  After each
   component is extracted the hash words are shifted left by 5, presumably so
   the next component is derived from different hash bits. */
1789 /* x component */
1790 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1791 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1792 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1793 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1794 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1795
1796 brw_push_insn_state( p );
1797 brw_set_mask_control( p, BRW_MASK_DISABLE );
1798 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1799 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1800 brw_pop_insn_state( p );
1801
1802 brw_MUL( p, x1y0, x1y0, t );
1803 brw_MUL( p, x1y1, x1y1, t );
1804 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1805 brw_MUL( p, x0y0, x0y0, param0 );
1806 brw_MUL( p, x0y1, x0y1, param0 );
1807
1808 /* y component */
1809 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1810 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1811 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1812 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1813
1814 brw_push_insn_state( p );
1815 brw_set_mask_control( p, BRW_MASK_DISABLE );
1816 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1817 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1818 brw_pop_insn_state( p );
1819
1820 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1821 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
/* t is reloaded with (param0 - 1) here; its next use is the x component of
   the front face further down. */
1822 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1823 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1824 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1825
1826 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1827 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1828 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1829 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1830
1831 /* z component */
1832 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1833 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1834 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1835 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1836
1837 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1838 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1839 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1840 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1841
1842 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1843 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1844 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1845 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1846
1847 /* We interpolate between the gradients using the polynomial
1848 6t^5 - 15t^4 + 10t^3 (Perlin). */
/* Evaluated as ((6t - 15)t + 10) * t * t * t for all three coordinates; the
   weights are kept in xi, yi and zi so that param0..param2 stay intact for
   the front face. */
1849 brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
1850 brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
1851 brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
1852 brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
1853 brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
1854 brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
1855 brw_MUL( p, xi, xi, param0 );
1856 brw_MUL( p, yi, yi, param1 );
1857 brw_MUL( p, zi, zi, param2 );
1858 brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
1859 brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
1860 brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
1861 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
1862 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
1863 brw_MUL( p, xi, xi, param0 );
1864 brw_MUL( p, yi, yi, param1 );
1865 brw_MUL( p, zi, zi, param2 );
1866 brw_MUL( p, xi, xi, param0 );
1867 brw_MUL( p, yi, yi, param1 );
1868 brw_MUL( p, zi, zi, param2 );
1869 brw_MUL( p, xi, xi, param0 );
1870 brw_MUL( p, yi, yi, param1 );
1871 brw_MUL( p, zi, zi, param2 );
1872
1873 /* Here we interpolate in the y dimension... */
1874 brw_MUL( p, x0y1, x0y1, yi );
1875 brw_MUL( p, x1y1, x1y1, yi );
1876 brw_ADD( p, x0y0, x0y0, x0y1 );
1877 brw_ADD( p, x1y0, x1y0, x1y1 );
1878
1879 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1880 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1881 brw_MUL( p, x1y0, x1y0, xi );
1882 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
1883
1884 /* Now do the same thing for the front four gradients... */
/* The front face uses the hashes in tmp[ 2 ] and tmp[ 3 ]; its z offset is
   (param2 - 1), loaded into t before the z component. */
1885 /* x component */
1886 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
1887 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
1888 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
1889 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
1890
1891 brw_push_insn_state( p );
1892 brw_set_mask_control( p, BRW_MASK_DISABLE );
1893 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1894 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1895 brw_pop_insn_state( p );
1896
1897 brw_MUL( p, x1y0, x1y0, t );
1898 brw_MUL( p, x1y1, x1y1, t );
1899 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1900 brw_MUL( p, x0y0, x0y0, param0 );
1901 brw_MUL( p, x0y1, x0y1, param0 );
1902
1903 /* y component */
1904 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1905 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1906 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1907 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1908
1909 brw_push_insn_state( p );
1910 brw_set_mask_control( p, BRW_MASK_DISABLE );
1911 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1912 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1913 brw_pop_insn_state( p );
1914
1915 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1916 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1917 brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
1918 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1919 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1920
1921 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1922 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1923 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1924 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1925
1926 /* z component */
1927 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1928 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1929 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1930 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1931
1932 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1933 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1934 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1935 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1936
1937 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1938 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1939 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1940 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1941
1942 /* The interpolation coefficients are still around from last time, so
1943 again interpolate in the y dimension... */
1944 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1945 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1946 brw_MUL( p, x0y1, x0y1, yi );
1947 brw_MUL( p, x1y1, x1y1, yi );
1948 brw_ADD( p, x0y0, x0y0, x0y1 );
1949 brw_ADD( p, x1y0, x1y0, x1y1 );
1950
1951 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
1952 time put the front face in tmp[ 1 ] and we're nearly there... */
1953 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1954 brw_MUL( p, x1y0, x1y0, xi );
1955 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
1956
1957 /* The final interpolation, in the z dimension: */
1958 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
1959 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
1960 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
1961
1962 /* scale by pow( 2, -15 ), as described above */
1963 brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
1964
1965 release_tmps( c, mark );
1966 }
1967
1968 static void emit_noise3( struct brw_wm_compile *c,
1969 const struct prog_instruction *inst )
1970 {
1971 struct brw_compile *p = &c->func;
1972 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1973 GLuint mask = inst->DstReg.WriteMask;
1974 int i;
1975 int mark = mark_tmps( c );
1976
1977 assert( mark == 0 );
1978
1979 src0 = get_src_reg( c, inst, 0, 0 );
1980 src1 = get_src_reg( c, inst, 0, 1 );
1981 src2 = get_src_reg( c, inst, 0, 2 );
1982
1983 param0 = alloc_tmp( c );
1984 param1 = alloc_tmp( c );
1985 param2 = alloc_tmp( c );
1986
1987 brw_MOV( p, param0, src0 );
1988 brw_MOV( p, param1, src1 );
1989 brw_MOV( p, param2, src2 );
1990
1991 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1992
1993 /* Fill in the result: */
1994 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1995 for (i = 0 ; i < 4; i++) {
1996 if (mask & (1<<i)) {
1997 dst = get_dst_reg(c, inst, i);
1998 brw_MOV( p, dst, param0 );
1999 }
2000 }
2001 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2002 brw_set_saturate( p, 0 );
2003
2004 release_tmps( c, mark );
2005 }
2006
2007 /**
2008 * For the four-dimensional case, the little micro-optimisation benefits
2009 * we obtain by unrolling all the loops aren't worth the massive bloat it
2010 * now causes. Instead, we loop twice around performing a similar operation
2011 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
2012 * code to glue it all together.
2013 */
2014 static void noise4_sub( struct brw_wm_compile *c )
2015 {
2016 struct brw_compile *p = &c->func;
2017 struct brw_reg param[ 4 ],
2018 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
2019 w0, /* noise for the w=0 cube */
2020 floors[ 2 ], /* integer coordinates of base corner of hypercube */
2021 interp[ 4 ], /* interpolation coefficients */
2022 t, tmp[ 8 ], /* float temporaries */
2023 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
2024 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
2025 int i, j;
2026 int mark = mark_tmps( c );
2027 GLuint loop, origin;
2028
2029 x0y0 = alloc_tmp( c );
2030 x0y1 = alloc_tmp( c );
2031 x1y0 = alloc_tmp( c );
2032 x1y1 = alloc_tmp( c );
2033 t = alloc_tmp( c );
2034 w0 = alloc_tmp( c );
2035 floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2036 floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
2037
2038 for( i = 0; i < 4; i++ ) {
2039 param[ i ] = lookup_tmp( c, mark - 5 + i );
2040 interp[ i ] = alloc_tmp( c );
2041 }
2042
2043 for( i = 0; i < 8; i++ ) {
2044 tmp[ i ] = alloc_tmp( c );
2045 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
2046 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
2047 }
2048
2049 brw_set_access_mode( p, BRW_ALIGN_1 );
2050
2051 /* We only want 16 bits of precision from the integral part of each
2052 co-ordinate, but unfortunately the RNDD semantics would saturate
2053 at 16 bits if we performed the operation directly to a 16-bit
2054 destination. Therefore, we round to 32-bit temporaries where
2055 appropriate, and then store only the lower 16 bits. */
2056 brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
2057 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
2058 brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
2059 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
2060 brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
2061 brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
2062
2063 /* Modify the flag register here, because the side effect is useful
2064 later (see below). We know for certain that all flags will be
2065 cleared, since the FRC instruction cannot possibly generate
2066 negative results. Even for exceptional inputs (infinities, denormals,
2067 NaNs), the architecture guarantees that the L conditional is false. */
2068 brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
2069 brw_FRC( p, param[ 0 ], param[ 0 ] );
2070 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2071 for( i = 1; i < 4; i++ )
2072 brw_FRC( p, param[ i ], param[ i ] );
2073
2074 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
2075 of all. */
2076 for( i = 0; i < 4; i++ )
2077 brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
2078 for( i = 0; i < 4; i++ )
2079 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
2080 for( i = 0; i < 4; i++ )
2081 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2082 for( i = 0; i < 4; i++ )
2083 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
2084 for( j = 0; j < 3; j++ )
2085 for( i = 0; i < 4; i++ )
2086 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
2087
2088 /* Mark the current address, as it will be a jump destination. The
2089 following code will be executed twice: first, with the flag
2090 register clear indicating the w=0 case, and second with flags
2091 set for w=1. */
2092 loop = p->nr_insn;
2093
2094 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
2095 be hashed. Since we have only 16 bits of precision in the hash, we
2096 must be careful about thorough mixing to maintain entropy as we
2097 squash the input vector into a small scalar. */
2098 brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
2099 brw_imm_uw( 0xBC8F ) );
2100 brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
2101 brw_imm_uw( 0xD0BD ) );
2102 brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
2103 brw_imm_uw( 0x9B93 ) );
2104 brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
2105 brw_imm_uw( 0xA359 ) );
2106 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
2107 brw_imm_uw( 0xBC8F ) );
2108
2109 /* Temporarily disable the execution mask while we work with ExecSize=16
2110 channels (the mask is set for ExecSize=8 and is probably incorrect).
2111 Although this might cause execution of unwanted channels, the code
2112 writes only to temporary registers and has no side effects, so
2113 disabling the mask is harmless. */
2114 brw_push_insn_state( p );
2115 brw_set_mask_control( p, BRW_MASK_DISABLE );
2116 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
2117 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
2118 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
2119
2120 /* We're now ready to perform the hashing. The eight hashes are
2121 interleaved for performance. The hash function used is
2122 designed to rapidly achieve avalanche and require only 16x16
2123 bit multiplication, and 8-bit swizzles (which we get for
2124 free). */
2125 for( i = 0; i < 4; i++ )
2126 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
2127 for( i = 0; i < 4; i++ )
2128 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2129 odd_bytes( wtmp[ i ] ) );
2130 for( i = 0; i < 4; i++ )
2131 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
2132 for( i = 0; i < 4; i++ )
2133 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
2134 odd_bytes( wtmp[ i ] ) );
2135 brw_pop_insn_state( p );
2136
2137 /* Now we want to initialise the four rear gradients based on the
2138 hashes. Format conversion from signed integer to float leaves
2139 everything scaled too high by a factor of pow( 2, 15 ), but
2140 we correct for that right at the end. */
2141 /* x component */
2142 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2143 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
2144 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
2145 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
2146 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
2147
2148 brw_push_insn_state( p );
2149 brw_set_mask_control( p, BRW_MASK_DISABLE );
2150 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2151 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2152 brw_pop_insn_state( p );
2153
2154 brw_MUL( p, x1y0, x1y0, t );
2155 brw_MUL( p, x1y1, x1y1, t );
2156 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2157 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2158 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2159
2160 /* y component */
2161 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2162 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2163 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2164 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2165
2166 brw_push_insn_state( p );
2167 brw_set_mask_control( p, BRW_MASK_DISABLE );
2168 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2169 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2170 brw_pop_insn_state( p );
2171
2172 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2173 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2174 /* prepare t for the w component (used below): w the first time through
2175 the loop; w - 1 the second time) */
2176 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2177 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2178 p->current->header.predicate_inverse = 1;
2179 brw_MOV( p, t, param[ 3 ] );
2180 p->current->header.predicate_inverse = 0;
2181 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2182 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2183 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2184
2185 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2186 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2187 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2188 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2189
2190 /* z component */
2191 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2192 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2193 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2194 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2195
2196 brw_push_insn_state( p );
2197 brw_set_mask_control( p, BRW_MASK_DISABLE );
2198 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2199 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2200 brw_pop_insn_state( p );
2201
2202 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
2203 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
2204 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
2205 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
2206
2207 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2208 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2209 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2210 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2211
2212 /* w component */
2213 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2214 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2215 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2216 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2217
2218 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2219 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2220 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2221 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2222 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2223
2224 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2225 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2226 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2227 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2228
2229 /* Here we interpolate in the y dimension... */
2230 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2231 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2232 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2233 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2234 brw_ADD( p, x0y0, x0y0, x0y1 );
2235 brw_ADD( p, x1y0, x1y0, x1y1 );
2236
2237 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
2238 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2239 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2240 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2241
2242 /* Now do the same thing for the front four gradients... */
2243 /* x component */
2244 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2245 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2246 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2247 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2248
2249 brw_push_insn_state( p );
2250 brw_set_mask_control( p, BRW_MASK_DISABLE );
2251 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2252 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2253 brw_pop_insn_state( p );
2254
2255 brw_MUL( p, x1y0, x1y0, t );
2256 brw_MUL( p, x1y1, x1y1, t );
2257 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2258 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2259 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2260
2261 /* y component */
2262 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2263 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2264 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2265 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2266
2267 brw_push_insn_state( p );
2268 brw_set_mask_control( p, BRW_MASK_DISABLE );
2269 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2270 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2271 brw_pop_insn_state( p );
2272
2273 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2274 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2275 brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
2276 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2277 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2278
2279 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2280 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2281 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2282 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2283
2284 /* z component */
2285 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2286 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2287 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2288 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2289
2290 brw_push_insn_state( p );
2291 brw_set_mask_control( p, BRW_MASK_DISABLE );
2292 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2293 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2294 brw_pop_insn_state( p );
2295
2296 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2297 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2298 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2299 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2300 /* prepare t for the w component (used below): w the first time through
2301 the loop; w - 1 the second time) */
2302 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2303 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2304 p->current->header.predicate_inverse = 1;
2305 brw_MOV( p, t, param[ 3 ] );
2306 p->current->header.predicate_inverse = 0;
2307 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2308
2309 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2310 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2311 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2312 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2313
2314 /* w component */
2315 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2316 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2317 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2318 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2319
2320 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2321 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2322 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2323 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2324
2325 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2326 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2327 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2328 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2329
2330 /* Interpolate in the y dimension: */
2331 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2332 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2333 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2334 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2335 brw_ADD( p, x0y0, x0y0, x0y1 );
2336 brw_ADD( p, x1y0, x1y0, x1y1 );
2337
2338 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2339 time put the front face in tmp[ 1 ] and we're nearly there... */
2340 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2341 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2342 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2343
2344 /* Another interpolation, in the z dimension: */
2345 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2346 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2347 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2348
2349 /* Exit the loop if we've computed both cubes... */
2350 origin = p->nr_insn;
2351 brw_push_insn_state( p );
2352 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2353 brw_set_mask_control( p, BRW_MASK_DISABLE );
2354 brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2355 brw_pop_insn_state( p );
2356
2357 /* Save the result for the w=0 case, and increment the w coordinate: */
2358 brw_MOV( p, w0, tmp[ 0 ] );
2359 brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2360 brw_imm_uw( 1 ) );
2361
2362 /* Loop around for the other cube. Explicitly set the flag register
2363 (unfortunately we must spend an extra instruction to do this: we
2364 can't rely on a side effect of the previous MOV or ADD because
2365 conditional modifiers which are normally true might be false in
2366 exceptional circumstances, e.g. given a NaN input; the add to
2367 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
2368 brw_push_insn_state( p );
2369 brw_set_mask_control( p, BRW_MASK_DISABLE );
2370 brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2371 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2372 brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2373 brw_pop_insn_state( p );
2374
2375 /* Patch the previous conditional branch now that we know the
2376 destination address. */
2377 brw_set_src1( p->store + origin,
2378 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2379
2380 /* The very last interpolation. */
2381 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2382 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2383 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2384
2385 /* scale by pow( 2, -15 ), as described above */
2386 brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2387
2388 release_tmps( c, mark );
2389 }
2390
2391 static void emit_noise4( struct brw_wm_compile *c,
2392 const struct prog_instruction *inst )
2393 {
2394 struct brw_compile *p = &c->func;
2395 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2396 GLuint mask = inst->DstReg.WriteMask;
2397 int i;
2398 int mark = mark_tmps( c );
2399
2400 assert( mark == 0 );
2401
2402 src0 = get_src_reg( c, inst, 0, 0 );
2403 src1 = get_src_reg( c, inst, 0, 1 );
2404 src2 = get_src_reg( c, inst, 0, 2 );
2405 src3 = get_src_reg( c, inst, 0, 3 );
2406
2407 param0 = alloc_tmp( c );
2408 param1 = alloc_tmp( c );
2409 param2 = alloc_tmp( c );
2410 param3 = alloc_tmp( c );
2411
2412 brw_MOV( p, param0, src0 );
2413 brw_MOV( p, param1, src1 );
2414 brw_MOV( p, param2, src2 );
2415 brw_MOV( p, param3, src3 );
2416
2417 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2418
2419 /* Fill in the result: */
2420 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2421 for (i = 0 ; i < 4; i++) {
2422 if (mask & (1<<i)) {
2423 dst = get_dst_reg(c, inst, i);
2424 brw_MOV( p, dst, param0 );
2425 }
2426 }
2427 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2428 brw_set_saturate( p, 0 );
2429
2430 release_tmps( c, mark );
2431 }
2432
2433 static void emit_wpos_xy(struct brw_wm_compile *c,
2434 const struct prog_instruction *inst)
2435 {
2436 struct brw_compile *p = &c->func;
2437 GLuint mask = inst->DstReg.WriteMask;
2438 struct brw_reg src0[2], dst[2];
2439
2440 dst[0] = get_dst_reg(c, inst, 0);
2441 dst[1] = get_dst_reg(c, inst, 1);
2442
2443 src0[0] = get_src_reg(c, inst, 0, 0);
2444 src0[1] = get_src_reg(c, inst, 0, 1);
2445
2446 /* Calculate the pixel offset from window bottom left into destination
2447 * X and Y channels.
2448 */
2449 if (mask & WRITEMASK_X) {
2450 /* X' = X - origin_x */
2451 brw_ADD(p,
2452 dst[0],
2453 retype(src0[0], BRW_REGISTER_TYPE_W),
2454 brw_imm_d(0 - c->key.origin_x));
2455 }
2456
2457 if (mask & WRITEMASK_Y) {
2458 /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2459 brw_ADD(p,
2460 dst[1],
2461 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2462 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2463 }
2464 }
2465
2466 /* TODO
2467 BIAS on SIMD8 not working yet...
2468 */
2469 static void emit_txb(struct brw_wm_compile *c,
2470 const struct prog_instruction *inst)
2471 {
2472 struct brw_compile *p = &c->func;
2473 struct brw_reg dst[4], src[4], payload_reg;
2474 /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
2475 const GLuint unit = inst->TexSrcUnit;
2476 GLuint i;
2477 GLuint msg_type;
2478
2479 assert(unit < BRW_MAX_TEX_UNIT);
2480
2481 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2482
2483 for (i = 0; i < 4; i++)
2484 dst[i] = get_dst_reg(c, inst, i);
2485 for (i = 0; i < 4; i++)
2486 src[i] = get_src_reg(c, inst, 0, i);
2487
2488 switch (inst->TexSrcTarget) {
2489 case TEXTURE_1D_INDEX:
2490 brw_MOV(p, brw_message_reg(2), src[0]); /* s coord */
2491 brw_MOV(p, brw_message_reg(3), brw_imm_f(0)); /* t coord */
2492 brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); /* r coord */
2493 break;
2494 case TEXTURE_2D_INDEX:
2495 case TEXTURE_RECT_INDEX:
2496 brw_MOV(p, brw_message_reg(2), src[0]);
2497 brw_MOV(p, brw_message_reg(3), src[1]);
2498 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2499 break;
2500 case TEXTURE_3D_INDEX:
2501 case TEXTURE_CUBE_INDEX:
2502 brw_MOV(p, brw_message_reg(2), src[0]);
2503 brw_MOV(p, brw_message_reg(3), src[1]);
2504 brw_MOV(p, brw_message_reg(4), src[2]);
2505 break;
2506 default:
2507 /* invalid target */
2508 abort();
2509 }
2510 brw_MOV(p, brw_message_reg(5), src[3]); /* bias */
2511 brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); /* ref (unused?) */
2512
2513 if (BRW_IS_IGDNG(p->brw)) {
2514 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG;
2515 } else {
2516 /* Does it work well on SIMD8? */
2517 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2518 }
2519
2520 brw_SAMPLE(p,
2521 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2522 1, /* msg_reg_nr */
2523 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2524 SURF_INDEX_TEXTURE(unit),
2525 unit, /* sampler */
2526 inst->DstReg.WriteMask, /* writemask */
2527 msg_type, /* msg_type */
2528 4, /* response_length */
2529 4, /* msg_length */
2530 0, /* eot */
2531 1,
2532 BRW_SAMPLER_SIMD_MODE_SIMD8);
2533 }
2534
2535
2536 static void emit_tex(struct brw_wm_compile *c,
2537 const struct prog_instruction *inst)
2538 {
2539 struct brw_compile *p = &c->func;
2540 struct brw_reg dst[4], src[4], payload_reg;
2541 /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
2542 const GLuint unit = inst->TexSrcUnit;
2543 GLuint msg_len;
2544 GLuint i, nr;
2545 GLuint emit;
2546 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2547 GLuint msg_type;
2548
2549 assert(unit < BRW_MAX_TEX_UNIT);
2550
2551 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2552
2553 for (i = 0; i < 4; i++)
2554 dst[i] = get_dst_reg(c, inst, i);
2555 for (i = 0; i < 4; i++)
2556 src[i] = get_src_reg(c, inst, 0, i);
2557
2558 switch (inst->TexSrcTarget) {
2559 case TEXTURE_1D_INDEX:
2560 emit = WRITEMASK_X;
2561 nr = 1;
2562 break;
2563 case TEXTURE_2D_INDEX:
2564 case TEXTURE_RECT_INDEX:
2565 emit = WRITEMASK_XY;
2566 nr = 2;
2567 break;
2568 case TEXTURE_3D_INDEX:
2569 case TEXTURE_CUBE_INDEX:
2570 emit = WRITEMASK_XYZ;
2571 nr = 3;
2572 break;
2573 default:
2574 /* invalid target */
2575 abort();
2576 }
2577 msg_len = 1;
2578
2579 /* move/load S, T, R coords */
2580 for (i = 0; i < nr; i++) {
2581 static const GLuint swz[4] = {0,1,2,2};
2582 if (emit & (1<<i))
2583 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2584 else
2585 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2586 msg_len += 1;
2587 }
2588
2589 if (shadow) {
2590 brw_MOV(p, brw_message_reg(5), brw_imm_f(0)); /* lod / bias */
2591 brw_MOV(p, brw_message_reg(6), src[2]); /* ref value / R coord */
2592 }
2593
2594 if (BRW_IS_IGDNG(p->brw)) {
2595 if (shadow)
2596 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG;
2597 else
2598 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG;
2599 } else {
2600 /* Does it work for shadow on SIMD8 ? */
2601 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2602 }
2603
2604 brw_SAMPLE(p,
2605 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
2606 1, /* msg_reg_nr */
2607 retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
2608 SURF_INDEX_TEXTURE(unit),
2609 unit, /* sampler */
2610 inst->DstReg.WriteMask, /* writemask */
2611 msg_type, /* msg_type */
2612 4, /* response_length */
2613 shadow ? 6 : 4, /* msg_length */
2614 0, /* eot */
2615 1,
2616 BRW_SAMPLER_SIMD_MODE_SIMD8);
2617
2618 if (shadow)
2619 brw_MOV(p, dst[3], brw_imm_f(1.0));
2620 }
2621
2622
2623 /**
2624 * Resolve subroutine calls after code emit is done.
2625 */
2626 static void post_wm_emit( struct brw_wm_compile *c )
2627 {
2628 brw_resolve_cals(&c->func);
2629 }
2630
2631 static void
2632 get_argument_regs(struct brw_wm_compile *c,
2633 const struct prog_instruction *inst,
2634 int index,
2635 struct brw_reg *regs,
2636 int mask)
2637 {
2638 int i;
2639
2640 for (i = 0; i < 4; i++) {
2641 if (mask & (1 << i))
2642 regs[i] = get_src_reg(c, inst, index, i);
2643 }
2644 }
2645
2646 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2647 {
2648 #define MAX_IF_DEPTH 32
2649 #define MAX_LOOP_DEPTH 32
2650 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
2651 GLuint i, if_depth = 0, loop_depth = 0;
2652 struct brw_compile *p = &c->func;
2653 struct brw_indirect stack_index = brw_indirect(0, 0);
2654
2655 c->out_of_regs = GL_FALSE;
2656
2657 prealloc_reg(c);
2658 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2659 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2660
2661 for (i = 0; i < c->nr_fp_insns; i++) {
2662 const struct prog_instruction *inst = &c->prog_instructions[i];
2663 int dst_flags;
2664 struct brw_reg args[3][4], dst[4];
2665 int j;
2666
2667 c->cur_inst = i;
2668
2669 #if 0
2670 _mesa_printf("Inst %d: ", i);
2671 _mesa_print_instruction(inst);
2672 #endif
2673
2674 /* fetch any constants that this instruction needs */
2675 if (c->fp->use_const_buffer)
2676 fetch_constants(c, inst);
2677
2678 if (inst->Opcode != OPCODE_ARL) {
2679 for (j = 0; j < 4; j++) {
2680 if (inst->DstReg.WriteMask & (1 << j))
2681 dst[j] = get_dst_reg(c, inst, j);
2682 else
2683 dst[j] = brw_null_reg();
2684 }
2685 }
2686 for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
2687 get_argument_regs(c, inst, j, args[j], WRITEMASK_XYZW);
2688
2689 dst_flags = inst->DstReg.WriteMask;
2690 if (inst->SaturateMode == SATURATE_ZERO_ONE)
2691 dst_flags |= SATURATE;
2692
2693 if (inst->CondUpdate)
2694 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2695 else
2696 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2697
2698 dst_flags = inst->DstReg.WriteMask;
2699 if (inst->SaturateMode == SATURATE_ZERO_ONE)
2700 dst_flags |= SATURATE;
2701
2702 switch (inst->Opcode) {
2703 case WM_PIXELXY:
2704 emit_pixel_xy(c, inst);
2705 break;
2706 case WM_DELTAXY:
2707 emit_delta_xy(c, inst);
2708 break;
2709 case WM_PIXELW:
2710 emit_pixel_w(c, inst);
2711 break;
2712 case WM_LINTERP:
2713 emit_linterp(c, inst);
2714 break;
2715 case WM_PINTERP:
2716 emit_pinterp(c, inst);
2717 break;
2718 case WM_CINTERP:
2719 emit_cinterp(c, inst);
2720 break;
2721 case WM_WPOSXY:
2722 emit_wpos_xy(c, inst);
2723 break;
2724 case WM_FB_WRITE:
2725 emit_fb_write(c, inst);
2726 break;
2727 case WM_FRONTFACING:
2728 emit_frontfacing(c, inst);
2729 break;
2730 case OPCODE_ADD:
2731 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
2732 break;
2733 case OPCODE_ARL:
2734 emit_arl(c, inst);
2735 break;
2736 case OPCODE_FRC:
2737 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
2738 break;
2739 case OPCODE_FLR:
2740 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
2741 break;
2742 case OPCODE_LRP:
2743 emit_lrp(c, inst);
2744 break;
2745 case OPCODE_TRUNC:
2746 emit_trunc(c, inst);
2747 break;
2748 case OPCODE_MOV:
2749 case OPCODE_SWZ:
2750 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
2751 break;
2752 case OPCODE_DP3:
2753 emit_dp3(c, inst);
2754 break;
2755 case OPCODE_DP4:
2756 emit_dp4(c, inst);
2757 break;
2758 case OPCODE_XPD:
2759 emit_xpd(c, inst);
2760 break;
2761 case OPCODE_DPH:
2762 emit_dph(c, inst);
2763 break;
2764 case OPCODE_RCP:
2765 emit_rcp(c, inst);
2766 break;
2767 case OPCODE_RSQ:
2768 emit_rsq(c, inst);
2769 break;
2770 case OPCODE_SIN:
2771 emit_sin(c, inst);
2772 break;
2773 case OPCODE_COS:
2774 emit_cos(c, inst);
2775 break;
2776 case OPCODE_EX2:
2777 emit_ex2(c, inst);
2778 break;
2779 case OPCODE_LG2:
2780 emit_lg2(c, inst);
2781 break;
2782 case OPCODE_MIN:
2783 case OPCODE_MAX:
2784 emit_min_max(c, inst);
2785 break;
2786 case OPCODE_DDX:
2787 case OPCODE_DDY:
2788 emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
2789 args[0]);
2790 break;
2791 case OPCODE_SLT:
2792 emit_slt(c, inst);
2793 break;
2794 case OPCODE_SLE:
2795 emit_sle(c, inst);
2796 break;
2797 case OPCODE_SGT:
2798 emit_sgt(c, inst);
2799 break;
2800 case OPCODE_SGE:
2801 emit_sge(c, inst);
2802 break;
2803 case OPCODE_SEQ:
2804 emit_seq(c, inst);
2805 break;
2806 case OPCODE_SNE:
2807 emit_sne(c, inst);
2808 break;
2809 case OPCODE_MUL:
2810 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
2811 break;
2812 case OPCODE_POW:
2813 emit_pow(c, inst);
2814 break;
2815 case OPCODE_MAD:
2816 emit_mad(c, inst);
2817 break;
2818 case OPCODE_NOISE1:
2819 emit_noise1(c, inst);
2820 break;
2821 case OPCODE_NOISE2:
2822 emit_noise2(c, inst);
2823 break;
2824 case OPCODE_NOISE3:
2825 emit_noise3(c, inst);
2826 break;
2827 case OPCODE_NOISE4:
2828 emit_noise4(c, inst);
2829 break;
2830 case OPCODE_TEX:
2831 emit_tex(c, inst);
2832 break;
2833 case OPCODE_TXB:
2834 emit_txb(c, inst);
2835 break;
2836 case OPCODE_KIL_NV:
2837 emit_kil(c);
2838 break;
2839 case OPCODE_IF:
2840 assert(if_depth < MAX_IF_DEPTH);
2841 if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
2842 break;
2843 case OPCODE_ELSE:
2844 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
2845 break;
2846 case OPCODE_ENDIF:
2847 assert(if_depth > 0);
2848 brw_ENDIF(p, if_inst[--if_depth]);
2849 break;
2850 case OPCODE_BGNSUB:
2851 brw_save_label(p, inst->Comment, p->nr_insn);
2852 break;
2853 case OPCODE_ENDSUB:
2854 /* no-op */
2855 break;
2856 case OPCODE_CAL:
2857 brw_push_insn_state(p);
2858 brw_set_mask_control(p, BRW_MASK_DISABLE);
2859 brw_set_access_mode(p, BRW_ALIGN_1);
2860 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2861 brw_set_access_mode(p, BRW_ALIGN_16);
2862 brw_ADD(p, get_addr_reg(stack_index),
2863 get_addr_reg(stack_index), brw_imm_d(4));
2864 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2865 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2866 brw_pop_insn_state(p);
2867 break;
2868
2869 case OPCODE_RET:
2870 brw_push_insn_state(p);
2871 brw_set_mask_control(p, BRW_MASK_DISABLE);
2872 brw_ADD(p, get_addr_reg(stack_index),
2873 get_addr_reg(stack_index), brw_imm_d(-4));
2874 brw_set_access_mode(p, BRW_ALIGN_1);
2875 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2876 brw_set_access_mode(p, BRW_ALIGN_16);
2877 brw_pop_insn_state(p);
2878
2879 break;
2880 case OPCODE_BGNLOOP:
2881 /* XXX may need to invalidate the current_constant regs */
2882 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2883 break;
2884 case OPCODE_BRK:
2885 brw_BREAK(p);
2886 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2887 break;
2888 case OPCODE_CONT:
2889 brw_CONT(p);
2890 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2891 break;
2892 case OPCODE_ENDLOOP:
2893 {
2894 struct brw_instruction *inst0, *inst1;
2895 GLuint br = 1;
2896
2897 if (BRW_IS_IGDNG(brw))
2898 br = 2;
2899
2900 loop_depth--;
2901 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2902 /* patch all the BREAK/CONT instructions from last BGNLOOP */
2903 while (inst0 > loop_inst[loop_depth]) {
2904 inst0--;
2905 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2906 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2907 inst0->bits3.if_else.pop_count = 0;
2908 }
2909 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2910 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2911 inst0->bits3.if_else.pop_count = 0;
2912 }
2913 }
2914 }
2915 break;
2916 default:
2917 _mesa_printf("unsupported IR in fragment shader %d\n",
2918 inst->Opcode);
2919 }
2920
2921 if (inst->CondUpdate)
2922 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2923 else
2924 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2925 }
2926 post_wm_emit(c);
2927
2928 if (INTEL_DEBUG & DEBUG_WM) {
2929 _mesa_printf("wm-native:\n");
2930 for (i = 0; i < p->nr_insn; i++)
2931 brw_disasm(stderr, &p->store[i]);
2932 _mesa_printf("\n");
2933 }
2934 }
2935
2936 /**
2937 * Do GPU code generation for shaders that use GLSL features such as
2938 * flow control. Other shaders will be compiled with the
2939 */
2940 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2941 {
2942 if (INTEL_DEBUG & DEBUG_WM) {
2943 _mesa_printf("brw_wm_glsl_emit:\n");
2944 }
2945
2946 /* initial instruction translation/simplification */
2947 brw_wm_pass_fp(c);
2948
2949 /* actual code generation */
2950 brw_wm_emit_glsl(brw, c);
2951
2952 if (INTEL_DEBUG & DEBUG_WM) {
2953 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2954 }
2955
2956 c->prog_data.total_grf = num_grf_used(c);
2957 c->prog_data.total_scratch = 0;
2958 }