i965: Fix glFrontFacing in twoside GLSL demo.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "brw_context.h"
4 #include "brw_eu.h"
5 #include "brw_wm.h"
6
/* Identifiers of the reusable instruction sequences; invoke_subroutine()
 * uses these values to index c->subroutines[].
 */
enum _subroutine {
    SUB_NOISE1,
    SUB_NOISE2,
    SUB_NOISE3,
    SUB_NOISE4
};
10
11
12 /**
13 * Determine if the given fragment program uses GLSL features such
14 * as flow conditionals, loops, subroutines.
15 * Some GLSL shaders may use these features, others might not.
16 */
17 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
18 {
19 int i;
20 for (i = 0; i < fp->Base.NumInstructions; i++) {
21 const struct prog_instruction *inst = &fp->Base.Instructions[i];
22 switch (inst->Opcode) {
23 case OPCODE_IF:
24 case OPCODE_TRUNC:
25 case OPCODE_ENDIF:
26 case OPCODE_CAL:
27 case OPCODE_BRK:
28 case OPCODE_RET:
29 case OPCODE_DDX:
30 case OPCODE_DDY:
31 case OPCODE_NOISE1:
32 case OPCODE_NOISE2:
33 case OPCODE_NOISE3:
34 case OPCODE_NOISE4:
35 case OPCODE_BGNLOOP:
36 return GL_TRUE;
37 default:
38 break;
39 }
40 }
41 return GL_FALSE;
42 }
43
44
45 /**
46 * Record the mapping of a Mesa register to a hardware register.
47 */
48 static void set_reg(struct brw_wm_compile *c, int file, int index,
49 int component, struct brw_reg reg)
50 {
51 c->wm_regs[file][index][component].reg = reg;
52 c->wm_regs[file][index][component].inited = GL_TRUE;
53 }
54
55 /**
56 * Examine instruction's write mask to find index of first component
57 * enabled for writing.
58 */
59 static int get_scalar_dst_index(struct prog_instruction *inst)
60 {
61 int i;
62 for (i = 0; i < 4; i++)
63 if (inst->DstReg.WriteMask & (1<<i))
64 break;
65 return i;
66 }
67
68 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
69 {
70 struct brw_reg reg;
71 if(c->tmp_index == c->tmp_max)
72 c->tmp_regs[ c->tmp_max++ ] = c->reg_index++;
73
74 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
75 return reg;
76 }
77
78 /**
79 * Save current temp register info.
80 * There must be a matching call to release_tmps().
81 */
82 static int mark_tmps(struct brw_wm_compile *c)
83 {
84 return c->tmp_index;
85 }
86
87 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
88 {
89 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
90 }
91
92 static void release_tmps(struct brw_wm_compile *c, int mark)
93 {
94 c->tmp_index = mark;
95 }
96
97 /**
98 * Convert Mesa src register to brw register.
99 *
100 * Since we're running in SOA mode each Mesa register corresponds to four
101 * hardware registers. We allocate the hardware registers as needed here.
102 *
103 * \param file register file, one of PROGRAM_x
104 * \param index register number
105 * \param component src component (X=0, Y=1, Z=2, W=3)
106 * \param nr not used?!?
107 * \param neg negate value?
108 * \param abs take absolute value?
109 */
110 static struct brw_reg
111 get_reg(struct brw_wm_compile *c, int file, int index, int component,
112 int nr, GLuint neg, GLuint abs)
113 {
114 struct brw_reg reg;
115 switch (file) {
116 case PROGRAM_STATE_VAR:
117 case PROGRAM_CONSTANT:
118 case PROGRAM_UNIFORM:
119 file = PROGRAM_STATE_VAR;
120 break;
121 case PROGRAM_UNDEFINED:
122 return brw_null_reg();
123 case PROGRAM_TEMPORARY:
124 case PROGRAM_INPUT:
125 case PROGRAM_OUTPUT:
126 case PROGRAM_PAYLOAD:
127 break;
128 default:
129 _mesa_problem(NULL, "Unexpected file in get_reg()");
130 return brw_null_reg();
131 }
132
133 /* see if we've already allocated a HW register for this Mesa register */
134 if (c->wm_regs[file][index][component].inited) {
135 /* yes, re-use */
136 reg = c->wm_regs[file][index][component].reg;
137 }
138 else {
139 /* no, allocate new register */
140 reg = brw_vec8_grf(c->reg_index, 0);
141 }
142
143 /* if this is a new register allocation, record it in the table */
144 if (!c->wm_regs[file][index][component].inited) {
145 set_reg(c, file, index, component, reg);
146 c->reg_index++;
147 }
148
149 if (c->reg_index >= BRW_WM_MAX_GRF - 12) {
150 /* ran out of temporary registers! */
151 #if 1
152 /* This is a big hack for now.
153 * Return bad register index, just don't hang the GPU.
154 */
155 _mesa_fprintf(stderr, "out of regs %d\n", c->reg_index);
156 c->reg_index = BRW_WM_MAX_GRF - 13;
157 #else
158 return brw_null_reg();
159 #endif
160 }
161
162 if (neg & (1 << component)) {
163 reg = negate(reg);
164 }
165 if (abs)
166 reg = brw_abs(reg);
167 return reg;
168 }
169
170
171 /**
172 * Preallocate registers. This sets up the Mesa to hardware register
173 * mapping for certain registers, such as constants (uniforms/state vars)
174 * and shader inputs.
175 */
176 static void prealloc_reg(struct brw_wm_compile *c)
177 {
178 int i, j;
179 struct brw_reg reg;
180 int nr_interp_regs = 0;
181 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
182
183 for (i = 0; i < 4; i++) {
184 if (i < c->key.nr_depth_regs)
185 reg = brw_vec8_grf(i * 2, 0);
186 else
187 reg = brw_vec8_grf(0, 0);
188 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
189 }
190 c->reg_index += 2 * c->key.nr_depth_regs;
191
192 /* constants */
193 {
194 const int nr_params = c->fp->program.Base.Parameters->NumParameters;
195 const struct gl_program_parameter_list *plist =
196 c->fp->program.Base.Parameters;
197 int index = 0;
198
199 /* number of float constants */
200 c->prog_data.nr_params = 4 * nr_params;
201
202 /* loop over program constants (float[4]) */
203 for (i = 0; i < nr_params; i++) {
204 /* loop over XYZW channels */
205 for (j = 0; j < 4; j++, index++) {
206 reg = brw_vec1_grf(c->reg_index + index / 8, index % 8);
207 /* Save pointer to parameter/constant value.
208 * Constants will be copied in prepare_constant_buffer()
209 */
210 c->prog_data.param[index] = &plist->ParameterValues[i][j];
211 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
212 }
213 }
214 /* number of constant regs used (each reg is float[8]) */
215 c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
216 c->reg_index += c->nr_creg;
217 }
218
219 /* fragment shader inputs */
220 for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
221 if (inputs & (1<<i)) {
222 nr_interp_regs++;
223 reg = brw_vec8_grf(c->reg_index, 0);
224 for (j = 0; j < 4; j++)
225 set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
226 c->reg_index += 2;
227 }
228 }
229
230 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
231 c->prog_data.urb_read_length = nr_interp_regs * 2;
232 c->prog_data.curb_read_length = c->nr_creg;
233 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
234 c->reg_index++;
235 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
236 c->reg_index += 2;
237 }
238
239
240 /**
241 * Convert Mesa dst register to brw register.
242 */
243 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
244 struct prog_instruction *inst, int component, int nr)
245 {
246 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
247 0, 0);
248 }
249
250
251 /**
252 * Convert Mesa src register to brw register.
253 */
254 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
255 struct prog_src_register *src, int index, int nr)
256 {
257 int component = GET_SWZ(src->Swizzle, index);
258 return get_reg(c, src->File, src->Index, component, nr,
259 src->NegateBase, src->Abs);
260 }
261
262 /**
263 * Subroutines are minimal support for resusable instruction sequences.
264 * They are implemented as simply as possible to minimise overhead: there
265 * is no explicit support for communication between the caller and callee
266 * other than saving the return address in a temporary register, nor is
267 * there any automatic local storage. This implies that great care is
268 * required before attempting reentrancy or any kind of nested
269 * subroutine invocations.
270 */
271 static void invoke_subroutine( struct brw_wm_compile *c,
272 enum _subroutine subroutine,
273 void (*emit)( struct brw_wm_compile * ) )
274 {
275 struct brw_compile *p = &c->func;
276
277 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
278
279 if( c->subroutines[ subroutine ] ) {
280 /* subroutine previously emitted: reuse existing instructions */
281
282 int mark = mark_tmps( c );
283 struct brw_reg return_address = retype( alloc_tmp( c ),
284 BRW_REGISTER_TYPE_UD );
285 int here = p->nr_insn;
286
287 brw_push_insn_state(p);
288 brw_set_mask_control(p, BRW_MASK_DISABLE);
289 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
290
291 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
292 brw_imm_d( ( c->subroutines[ subroutine ] -
293 here - 1 ) << 4 ) );
294 brw_pop_insn_state(p);
295
296 release_tmps( c, mark );
297 } else {
298 /* previously unused subroutine: emit, and mark for later reuse */
299
300 int mark = mark_tmps( c );
301 struct brw_reg return_address = retype( alloc_tmp( c ),
302 BRW_REGISTER_TYPE_UD );
303 struct brw_instruction *calc;
304 int base = p->nr_insn;
305
306 brw_push_insn_state(p);
307 brw_set_mask_control(p, BRW_MASK_DISABLE);
308 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
309 brw_pop_insn_state(p);
310
311 c->subroutines[ subroutine ] = p->nr_insn;
312
313 emit( c );
314
315 brw_push_insn_state(p);
316 brw_set_mask_control(p, BRW_MASK_DISABLE);
317 brw_MOV( p, brw_ip_reg(), return_address );
318 brw_pop_insn_state(p);
319
320 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
321
322 release_tmps( c, mark );
323 }
324 }
325
326 static void emit_abs( struct brw_wm_compile *c,
327 struct prog_instruction *inst)
328 {
329 int i;
330 struct brw_compile *p = &c->func;
331 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
332 for (i = 0; i < 4; i++) {
333 if (inst->DstReg.WriteMask & (1<<i)) {
334 struct brw_reg src, dst;
335 dst = get_dst_reg(c, inst, i, 1);
336 src = get_src_reg(c, &inst->SrcReg[0], i, 1);
337 brw_MOV(p, dst, brw_abs(src));
338 }
339 }
340 brw_set_saturate(p, 0);
341 }
342
343 static void emit_trunc( struct brw_wm_compile *c,
344 struct prog_instruction *inst)
345 {
346 int i;
347 struct brw_compile *p = &c->func;
348 GLuint mask = inst->DstReg.WriteMask;
349 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
350 for (i = 0; i < 4; i++) {
351 if (mask & (1<<i)) {
352 struct brw_reg src, dst;
353 dst = get_dst_reg(c, inst, i, 1) ;
354 src = get_src_reg(c, &inst->SrcReg[0], i, 1);
355 brw_RNDZ(p, dst, src);
356 }
357 }
358 brw_set_saturate(p, 0);
359 }
360
361 static void emit_mov( struct brw_wm_compile *c,
362 struct prog_instruction *inst)
363 {
364 int i;
365 struct brw_compile *p = &c->func;
366 GLuint mask = inst->DstReg.WriteMask;
367 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
368 for (i = 0; i < 4; i++) {
369 if (mask & (1<<i)) {
370 struct brw_reg src, dst;
371 dst = get_dst_reg(c, inst, i, 1);
372 src = get_src_reg(c, &inst->SrcReg[0], i, 1);
373 brw_MOV(p, dst, src);
374 }
375 }
376 brw_set_saturate(p, 0);
377 }
378
379 static void emit_pixel_xy(struct brw_wm_compile *c,
380 struct prog_instruction *inst)
381 {
382 struct brw_reg r1 = brw_vec1_grf(1, 0);
383 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
384
385 struct brw_reg dst0, dst1;
386 struct brw_compile *p = &c->func;
387 GLuint mask = inst->DstReg.WriteMask;
388
389 dst0 = get_dst_reg(c, inst, 0, 1);
390 dst1 = get_dst_reg(c, inst, 1, 1);
391 /* Calculate pixel centers by adding 1 or 0 to each of the
392 * micro-tile coordinates passed in r1.
393 */
394 if (mask & WRITEMASK_X) {
395 brw_ADD(p,
396 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
397 stride(suboffset(r1_uw, 4), 2, 4, 0),
398 brw_imm_v(0x10101010));
399 }
400
401 if (mask & WRITEMASK_Y) {
402 brw_ADD(p,
403 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
404 stride(suboffset(r1_uw, 5), 2, 4, 0),
405 brw_imm_v(0x11001100));
406 }
407 }
408
409 static void emit_delta_xy(struct brw_wm_compile *c,
410 struct prog_instruction *inst)
411 {
412 struct brw_reg r1 = brw_vec1_grf(1, 0);
413 struct brw_reg dst0, dst1, src0, src1;
414 struct brw_compile *p = &c->func;
415 GLuint mask = inst->DstReg.WriteMask;
416
417 dst0 = get_dst_reg(c, inst, 0, 1);
418 dst1 = get_dst_reg(c, inst, 1, 1);
419 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
420 src1 = get_src_reg(c, &inst->SrcReg[0], 1, 1);
421 /* Calc delta X,Y by subtracting origin in r1 from the pixel
422 * centers.
423 */
424 if (mask & WRITEMASK_X) {
425 brw_ADD(p,
426 dst0,
427 retype(src0, BRW_REGISTER_TYPE_UW),
428 negate(r1));
429 }
430
431 if (mask & WRITEMASK_Y) {
432 brw_ADD(p,
433 dst1,
434 retype(src1, BRW_REGISTER_TYPE_UW),
435 negate(suboffset(r1,1)));
436
437 }
438 }
439
440 static void fire_fb_write( struct brw_wm_compile *c,
441 GLuint base_reg,
442 GLuint nr,
443 GLuint target,
444 GLuint eot)
445 {
446 struct brw_compile *p = &c->func;
447 /* Pass through control information:
448 */
449 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
450 {
451 brw_push_insn_state(p);
452 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
453 brw_MOV(p,
454 brw_message_reg(base_reg + 1),
455 brw_vec8_grf(1, 0));
456 brw_pop_insn_state(p);
457 }
458 /* Send framebuffer write message: */
459 brw_fb_WRITE(p,
460 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
461 base_reg,
462 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
463 target,
464 nr,
465 0,
466 eot);
467 }
468
469 static void emit_fb_write(struct brw_wm_compile *c,
470 struct prog_instruction *inst)
471 {
472 struct brw_compile *p = &c->func;
473 int nr = 2;
474 int channel;
475 GLuint target, eot;
476 struct brw_reg src0;
477
478 /* Reserve a space for AA - may not be needed:
479 */
480 if (c->key.aa_dest_stencil_reg)
481 nr += 1;
482
483 brw_push_insn_state(p);
484 for (channel = 0; channel < 4; channel++) {
485 src0 = get_src_reg(c, &inst->SrcReg[0], channel, 1);
486 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
487 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
488 brw_MOV(p, brw_message_reg(nr + channel), src0);
489 }
490 /* skip over the regs populated above: */
491 nr += 8;
492 brw_pop_insn_state(p);
493
494 if (c->key.source_depth_to_render_target) {
495 if (c->key.computes_depth) {
496 src0 = get_src_reg(c, &inst->SrcReg[2], 2, 1);
497 brw_MOV(p, brw_message_reg(nr), src0);
498 }
499 else {
500 src0 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
501 brw_MOV(p, brw_message_reg(nr), src0);
502 }
503
504 nr += 2;
505 }
506
507 if (c->key.dest_depth_reg) {
508 GLuint comp = c->key.dest_depth_reg / 2;
509 GLuint off = c->key.dest_depth_reg % 2;
510
511 assert(comp == 1);
512 assert(off == 0);
513 #if 0
514 /* XXX do we need this code? comp always 1, off always 0, it seems */
515 if (off != 0) {
516 brw_push_insn_state(p);
517 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
518
519 brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
520 /* 2nd half? */
521 brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
522 brw_pop_insn_state(p);
523 }
524 else
525 #endif
526 {
527 struct brw_reg src = get_src_reg(c, &inst->SrcReg[1], 1, 1);
528 brw_MOV(p, brw_message_reg(nr), src);
529 }
530 nr += 2;
531 }
532
533 target = inst->Aux >> 1;
534 eot = inst->Aux & 1;
535 fire_fb_write(c, 0, nr, target, eot);
536 }
537
538 static void emit_pixel_w( struct brw_wm_compile *c,
539 struct prog_instruction *inst)
540 {
541 struct brw_compile *p = &c->func;
542 GLuint mask = inst->DstReg.WriteMask;
543 if (mask & WRITEMASK_W) {
544 struct brw_reg dst, src0, delta0, delta1;
545 struct brw_reg interp3;
546
547 dst = get_dst_reg(c, inst, 3, 1);
548 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
549 delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
550 delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
551
552 interp3 = brw_vec1_grf(src0.nr+1, 4);
553 /* Calc 1/w - just linterp wpos[3] optimized by putting the
554 * result straight into a message reg.
555 */
556 brw_LINE(p, brw_null_reg(), interp3, delta0);
557 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
558
559 /* Calc w */
560 brw_math_16( p, dst,
561 BRW_MATH_FUNCTION_INV,
562 BRW_MATH_SATURATE_NONE,
563 2, brw_null_reg(),
564 BRW_MATH_PRECISION_FULL);
565 }
566 }
567
568 static void emit_linterp(struct brw_wm_compile *c,
569 struct prog_instruction *inst)
570 {
571 struct brw_compile *p = &c->func;
572 GLuint mask = inst->DstReg.WriteMask;
573 struct brw_reg interp[4];
574 struct brw_reg dst, delta0, delta1;
575 struct brw_reg src0;
576 GLuint nr, i;
577
578 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
579 delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
580 delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
581 nr = src0.nr;
582
583 interp[0] = brw_vec1_grf(nr, 0);
584 interp[1] = brw_vec1_grf(nr, 4);
585 interp[2] = brw_vec1_grf(nr+1, 0);
586 interp[3] = brw_vec1_grf(nr+1, 4);
587
588 for(i = 0; i < 4; i++ ) {
589 if (mask & (1<<i)) {
590 dst = get_dst_reg(c, inst, i, 1);
591 brw_LINE(p, brw_null_reg(), interp[i], delta0);
592 brw_MAC(p, dst, suboffset(interp[i],1), delta1);
593 }
594 }
595 }
596
597 static void emit_cinterp(struct brw_wm_compile *c,
598 struct prog_instruction *inst)
599 {
600 struct brw_compile *p = &c->func;
601 GLuint mask = inst->DstReg.WriteMask;
602
603 struct brw_reg interp[4];
604 struct brw_reg dst, src0;
605 GLuint nr, i;
606
607 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
608 nr = src0.nr;
609
610 interp[0] = brw_vec1_grf(nr, 0);
611 interp[1] = brw_vec1_grf(nr, 4);
612 interp[2] = brw_vec1_grf(nr+1, 0);
613 interp[3] = brw_vec1_grf(nr+1, 4);
614
615 for(i = 0; i < 4; i++ ) {
616 if (mask & (1<<i)) {
617 dst = get_dst_reg(c, inst, i, 1);
618 brw_MOV(p, dst, suboffset(interp[i],3));
619 }
620 }
621 }
622
623 static void emit_pinterp(struct brw_wm_compile *c,
624 struct prog_instruction *inst)
625 {
626 struct brw_compile *p = &c->func;
627 GLuint mask = inst->DstReg.WriteMask;
628
629 struct brw_reg interp[4];
630 struct brw_reg dst, delta0, delta1;
631 struct brw_reg src0, w;
632 GLuint nr, i;
633
634 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
635 delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
636 delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
637 w = get_src_reg(c, &inst->SrcReg[2], 3, 1);
638 nr = src0.nr;
639
640 interp[0] = brw_vec1_grf(nr, 0);
641 interp[1] = brw_vec1_grf(nr, 4);
642 interp[2] = brw_vec1_grf(nr+1, 0);
643 interp[3] = brw_vec1_grf(nr+1, 4);
644
645 for(i = 0; i < 4; i++ ) {
646 if (mask & (1<<i)) {
647 dst = get_dst_reg(c, inst, i, 1);
648 brw_LINE(p, brw_null_reg(), interp[i], delta0);
649 brw_MAC(p, dst, suboffset(interp[i],1),
650 delta1);
651 brw_MUL(p, dst, dst, w);
652 }
653 }
654 }
655
656 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
657 static void emit_frontfacing(struct brw_wm_compile *c,
658 struct prog_instruction *inst)
659 {
660 struct brw_compile *p = &c->func;
661 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
662 struct brw_reg dst;
663 GLuint mask = inst->DstReg.WriteMask;
664 int i;
665
666 for (i = 0; i < 4; i++) {
667 if (mask & (1<<i)) {
668 dst = get_dst_reg(c, inst, i, 1);
669 brw_MOV(p, dst, brw_imm_f(0.0));
670 }
671 }
672
673 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
674 * us front face
675 */
676 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
677 for (i = 0; i < 4; i++) {
678 if (mask & (1<<i)) {
679 dst = get_dst_reg(c, inst, i, 1);
680 brw_MOV(p, dst, brw_imm_f(1.0));
681 }
682 }
683 brw_set_predicate_control_flag_value(p, 0xff);
684 }
685
686 static void emit_xpd(struct brw_wm_compile *c,
687 struct prog_instruction *inst)
688 {
689 int i;
690 struct brw_compile *p = &c->func;
691 GLuint mask = inst->DstReg.WriteMask;
692 for (i = 0; i < 4; i++) {
693 GLuint i2 = (i+2)%3;
694 GLuint i1 = (i+1)%3;
695 if (mask & (1<<i)) {
696 struct brw_reg src0, src1, dst;
697 dst = get_dst_reg(c, inst, i, 1);
698 src0 = negate(get_src_reg(c, &inst->SrcReg[0], i2, 1));
699 src1 = get_src_reg(c, &inst->SrcReg[1], i1, 1);
700 brw_MUL(p, brw_null_reg(), src0, src1);
701 src0 = get_src_reg(c, &inst->SrcReg[0], i1, 1);
702 src1 = get_src_reg(c, &inst->SrcReg[1], i2, 1);
703 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
704 brw_MAC(p, dst, src0, src1);
705 brw_set_saturate(p, 0);
706 }
707 }
708 brw_set_saturate(p, 0);
709 }
710
711 static void emit_dp3(struct brw_wm_compile *c,
712 struct prog_instruction *inst)
713 {
714 struct brw_reg src0[3], src1[3], dst;
715 int i;
716 struct brw_compile *p = &c->func;
717 for (i = 0; i < 3; i++) {
718 src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
719 src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
720 }
721
722 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
723 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
724 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
725 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
726 brw_MAC(p, dst, src0[2], src1[2]);
727 brw_set_saturate(p, 0);
728 }
729
730 static void emit_dp4(struct brw_wm_compile *c,
731 struct prog_instruction *inst)
732 {
733 struct brw_reg src0[4], src1[4], dst;
734 int i;
735 struct brw_compile *p = &c->func;
736 for (i = 0; i < 4; i++) {
737 src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
738 src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
739 }
740 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
741 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
742 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
743 brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
744 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
745 brw_MAC(p, dst, src0[3], src1[3]);
746 brw_set_saturate(p, 0);
747 }
748
749 static void emit_dph(struct brw_wm_compile *c,
750 struct prog_instruction *inst)
751 {
752 struct brw_reg src0[4], src1[4], dst;
753 int i;
754 struct brw_compile *p = &c->func;
755 for (i = 0; i < 4; i++) {
756 src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
757 src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
758 }
759 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
760 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
761 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
762 brw_MAC(p, dst, src0[2], src1[2]);
763 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
764 brw_ADD(p, dst, dst, src1[3]);
765 brw_set_saturate(p, 0);
766 }
767
768 /**
769 * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
770 * Note that the result of the function is smeared across the dest
771 * register's X, Y, Z and W channels (subject to writemasking of course).
772 */
773 static void emit_math1(struct brw_wm_compile *c,
774 struct prog_instruction *inst, GLuint func)
775 {
776 struct brw_compile *p = &c->func;
777 struct brw_reg src0, dst, tmp;
778 const int mark = mark_tmps( c );
779 int i;
780
781 tmp = alloc_tmp(c);
782
783 /* Get first component of source register */
784 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
785
786 /* tmp = func(src0) */
787 brw_MOV(p, brw_message_reg(2), src0);
788 brw_math(p,
789 tmp,
790 func,
791 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
792 2,
793 brw_null_reg(),
794 BRW_MATH_DATA_VECTOR,
795 BRW_MATH_PRECISION_FULL);
796
797 /*tmp.dw1.bits.swizzle = SWIZZLE_XXXX;*/
798
799 /* replicate tmp value across enabled dest channels */
800 for (i = 0; i < 4; i++) {
801 if (inst->DstReg.WriteMask & (1 << i)) {
802 dst = get_dst_reg(c, inst, i, 1);
803 brw_MOV(p, dst, tmp);
804 }
805 }
806
807 release_tmps(c, mark);
808 }
809
810 static void emit_rcp(struct brw_wm_compile *c,
811 struct prog_instruction *inst)
812 {
813 emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
814 }
815
816 static void emit_rsq(struct brw_wm_compile *c,
817 struct prog_instruction *inst)
818 {
819 emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
820 }
821
822 static void emit_sin(struct brw_wm_compile *c,
823 struct prog_instruction *inst)
824 {
825 emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
826 }
827
828 static void emit_cos(struct brw_wm_compile *c,
829 struct prog_instruction *inst)
830 {
831 emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
832 }
833
834 static void emit_ex2(struct brw_wm_compile *c,
835 struct prog_instruction *inst)
836 {
837 emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
838 }
839
840 static void emit_lg2(struct brw_wm_compile *c,
841 struct prog_instruction *inst)
842 {
843 emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
844 }
845
846 static void emit_add(struct brw_wm_compile *c,
847 struct prog_instruction *inst)
848 {
849 struct brw_compile *p = &c->func;
850 struct brw_reg src0, src1, dst;
851 GLuint mask = inst->DstReg.WriteMask;
852 int i;
853 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
854 for (i = 0 ; i < 4; i++) {
855 if (mask & (1<<i)) {
856 dst = get_dst_reg(c, inst, i, 1);
857 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
858 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
859 brw_ADD(p, dst, src0, src1);
860 }
861 }
862 brw_set_saturate(p, 0);
863 }
864
865 static void emit_sub(struct brw_wm_compile *c,
866 struct prog_instruction *inst)
867 {
868 struct brw_compile *p = &c->func;
869 struct brw_reg src0, src1, dst;
870 GLuint mask = inst->DstReg.WriteMask;
871 int i;
872 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
873 for (i = 0 ; i < 4; i++) {
874 if (mask & (1<<i)) {
875 dst = get_dst_reg(c, inst, i, 1);
876 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
877 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
878 brw_ADD(p, dst, src0, negate(src1));
879 }
880 }
881 brw_set_saturate(p, 0);
882 }
883
884 static void emit_mul(struct brw_wm_compile *c,
885 struct prog_instruction *inst)
886 {
887 struct brw_compile *p = &c->func;
888 struct brw_reg src0, src1, dst;
889 GLuint mask = inst->DstReg.WriteMask;
890 int i;
891 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
892 for (i = 0 ; i < 4; i++) {
893 if (mask & (1<<i)) {
894 dst = get_dst_reg(c, inst, i, 1);
895 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
896 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
897 brw_MUL(p, dst, src0, src1);
898 }
899 }
900 brw_set_saturate(p, 0);
901 }
902
903 static void emit_frc(struct brw_wm_compile *c,
904 struct prog_instruction *inst)
905 {
906 struct brw_compile *p = &c->func;
907 struct brw_reg src0, dst;
908 GLuint mask = inst->DstReg.WriteMask;
909 int i;
910 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
911 for (i = 0 ; i < 4; i++) {
912 if (mask & (1<<i)) {
913 dst = get_dst_reg(c, inst, i, 1);
914 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
915 brw_FRC(p, dst, src0);
916 }
917 }
918 if (inst->SaturateMode != SATURATE_OFF)
919 brw_set_saturate(p, 0);
920 }
921
922 static void emit_flr(struct brw_wm_compile *c,
923 struct prog_instruction *inst)
924 {
925 struct brw_compile *p = &c->func;
926 struct brw_reg src0, dst;
927 GLuint mask = inst->DstReg.WriteMask;
928 int i;
929 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
930 for (i = 0 ; i < 4; i++) {
931 if (mask & (1<<i)) {
932 dst = get_dst_reg(c, inst, i, 1);
933 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
934 brw_RNDD(p, dst, src0);
935 }
936 }
937 brw_set_saturate(p, 0);
938 }
939
940 static void emit_max(struct brw_wm_compile *c,
941 struct prog_instruction *inst)
942 {
943 struct brw_compile *p = &c->func;
944 GLuint mask = inst->DstReg.WriteMask;
945 struct brw_reg src0, src1, dst;
946 int i;
947 brw_push_insn_state(p);
948 for (i = 0; i < 4; i++) {
949 if (mask & (1<<i)) {
950 dst = get_dst_reg(c, inst, i, 1);
951 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
952 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
953 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
954 brw_MOV(p, dst, src0);
955 brw_set_saturate(p, 0);
956
957 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src0, src1);
958 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
959 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
960 brw_MOV(p, dst, src1);
961 brw_set_saturate(p, 0);
962 brw_set_predicate_control_flag_value(p, 0xff);
963 }
964 }
965 brw_pop_insn_state(p);
966 }
967
968 static void emit_min(struct brw_wm_compile *c,
969 struct prog_instruction *inst)
970 {
971 struct brw_compile *p = &c->func;
972 GLuint mask = inst->DstReg.WriteMask;
973 struct brw_reg src0, src1, dst;
974 int i;
975 brw_push_insn_state(p);
976 for (i = 0; i < 4; i++) {
977 if (mask & (1<<i)) {
978 dst = get_dst_reg(c, inst, i, 1);
979 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
980 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
981 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
982 brw_MOV(p, dst, src0);
983 brw_set_saturate(p, 0);
984
985 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
986 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
987 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
988 brw_MOV(p, dst, src1);
989 brw_set_saturate(p, 0);
990 brw_set_predicate_control_flag_value(p, 0xff);
991 }
992 }
993 brw_pop_insn_state(p);
994 }
995
996 static void emit_pow(struct brw_wm_compile *c,
997 struct prog_instruction *inst)
998 {
999 struct brw_compile *p = &c->func;
1000 struct brw_reg dst, src0, src1;
1001 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
1002 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
1003 src1 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
1004
1005 brw_MOV(p, brw_message_reg(2), src0);
1006 brw_MOV(p, brw_message_reg(3), src1);
1007
1008 brw_math(p,
1009 dst,
1010 BRW_MATH_FUNCTION_POW,
1011 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
1012 2,
1013 brw_null_reg(),
1014 BRW_MATH_DATA_VECTOR,
1015 BRW_MATH_PRECISION_FULL);
1016 }
1017
1018 static void emit_lrp(struct brw_wm_compile *c,
1019 struct prog_instruction *inst)
1020 {
1021 struct brw_compile *p = &c->func;
1022 GLuint mask = inst->DstReg.WriteMask;
1023 struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
1024 int i;
1025 int mark = mark_tmps(c);
1026 for (i = 0; i < 4; i++) {
1027 if (mask & (1<<i)) {
1028 dst = get_dst_reg(c, inst, i, 1);
1029 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
1030
1031 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
1032
1033 if (src1.nr == dst.nr) {
1034 tmp1 = alloc_tmp(c);
1035 brw_MOV(p, tmp1, src1);
1036 } else
1037 tmp1 = src1;
1038
1039 src2 = get_src_reg(c, &inst->SrcReg[2], i, 1);
1040 if (src2.nr == dst.nr) {
1041 tmp2 = alloc_tmp(c);
1042 brw_MOV(p, tmp2, src2);
1043 } else
1044 tmp2 = src2;
1045
1046 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
1047 brw_MUL(p, brw_null_reg(), dst, tmp2);
1048 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1049 brw_MAC(p, dst, src0, tmp1);
1050 brw_set_saturate(p, 0);
1051 }
1052 release_tmps(c, mark);
1053 }
1054 }
1055
1056 /**
1057 * For GLSL shaders, this KIL will be unconditional.
1058 * It may be contained inside an IF/ENDIF structure of course.
1059 */
1060 static void emit_kil(struct brw_wm_compile *c)
1061 {
1062 struct brw_compile *p = &c->func;
1063 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1064 brw_push_insn_state(p);
1065 brw_set_mask_control(p, BRW_MASK_DISABLE);
1066 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
1067 brw_AND(p, depth, c->emit_mask_reg, depth);
1068 brw_pop_insn_state(p);
1069 }
1070
1071 static void emit_mad(struct brw_wm_compile *c,
1072 struct prog_instruction *inst)
1073 {
1074 struct brw_compile *p = &c->func;
1075 GLuint mask = inst->DstReg.WriteMask;
1076 struct brw_reg dst, src0, src1, src2;
1077 int i;
1078
1079 for (i = 0; i < 4; i++) {
1080 if (mask & (1<<i)) {
1081 dst = get_dst_reg(c, inst, i, 1);
1082 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
1083 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
1084 src2 = get_src_reg(c, &inst->SrcReg[2], i, 1);
1085 brw_MUL(p, dst, src0, src1);
1086
1087 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
1088 brw_ADD(p, dst, dst, src2);
1089 brw_set_saturate(p, 0);
1090 }
1091 }
1092 }
1093
1094 static void emit_sop(struct brw_wm_compile *c,
1095 struct prog_instruction *inst, GLuint cond)
1096 {
1097 struct brw_compile *p = &c->func;
1098 GLuint mask = inst->DstReg.WriteMask;
1099 struct brw_reg dst, src0, src1;
1100 int i;
1101
1102 for (i = 0; i < 4; i++) {
1103 if (mask & (1<<i)) {
1104 dst = get_dst_reg(c, inst, i, 1);
1105 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
1106 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
1107 brw_push_insn_state(p);
1108 brw_CMP(p, brw_null_reg(), cond, src0, src1);
1109 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1110 brw_MOV(p, dst, brw_imm_f(0.0));
1111 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
1112 brw_MOV(p, dst, brw_imm_f(1.0));
1113 brw_pop_insn_state(p);
1114 }
1115 }
1116 }
1117
1118 static void emit_slt(struct brw_wm_compile *c,
1119 struct prog_instruction *inst)
1120 {
1121 emit_sop(c, inst, BRW_CONDITIONAL_L);
1122 }
1123
1124 static void emit_sle(struct brw_wm_compile *c,
1125 struct prog_instruction *inst)
1126 {
1127 emit_sop(c, inst, BRW_CONDITIONAL_LE);
1128 }
1129
1130 static void emit_sgt(struct brw_wm_compile *c,
1131 struct prog_instruction *inst)
1132 {
1133 emit_sop(c, inst, BRW_CONDITIONAL_G);
1134 }
1135
1136 static void emit_sge(struct brw_wm_compile *c,
1137 struct prog_instruction *inst)
1138 {
1139 emit_sop(c, inst, BRW_CONDITIONAL_GE);
1140 }
1141
1142 static void emit_seq(struct brw_wm_compile *c,
1143 struct prog_instruction *inst)
1144 {
1145 emit_sop(c, inst, BRW_CONDITIONAL_EQ);
1146 }
1147
1148 static void emit_sne(struct brw_wm_compile *c,
1149 struct prog_instruction *inst)
1150 {
1151 emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
1152 }
1153
1154 static void emit_ddx(struct brw_wm_compile *c,
1155 struct prog_instruction *inst)
1156 {
1157 struct brw_compile *p = &c->func;
1158 GLuint mask = inst->DstReg.WriteMask;
1159 struct brw_reg interp[4];
1160 struct brw_reg dst;
1161 struct brw_reg src0, w;
1162 GLuint nr, i;
1163 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
1164 w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
1165 nr = src0.nr;
1166 interp[0] = brw_vec1_grf(nr, 0);
1167 interp[1] = brw_vec1_grf(nr, 4);
1168 interp[2] = brw_vec1_grf(nr+1, 0);
1169 interp[3] = brw_vec1_grf(nr+1, 4);
1170 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1171 for(i = 0; i < 4; i++ ) {
1172 if (mask & (1<<i)) {
1173 dst = get_dst_reg(c, inst, i, 1);
1174 brw_MOV(p, dst, interp[i]);
1175 brw_MUL(p, dst, dst, w);
1176 }
1177 }
1178 brw_set_saturate(p, 0);
1179 }
1180
1181 static void emit_ddy(struct brw_wm_compile *c,
1182 struct prog_instruction *inst)
1183 {
1184 struct brw_compile *p = &c->func;
1185 GLuint mask = inst->DstReg.WriteMask;
1186 struct brw_reg interp[4];
1187 struct brw_reg dst;
1188 struct brw_reg src0, w;
1189 GLuint nr, i;
1190
1191 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
1192 nr = src0.nr;
1193 w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
1194 interp[0] = brw_vec1_grf(nr, 0);
1195 interp[1] = brw_vec1_grf(nr, 4);
1196 interp[2] = brw_vec1_grf(nr+1, 0);
1197 interp[3] = brw_vec1_grf(nr+1, 4);
1198 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1199 for(i = 0; i < 4; i++ ) {
1200 if (mask & (1<<i)) {
1201 dst = get_dst_reg(c, inst, i, 1);
1202 brw_MOV(p, dst, suboffset(interp[i], 1));
1203 brw_MUL(p, dst, dst, w);
1204 }
1205 }
1206 brw_set_saturate(p, 0);
1207 }
1208
1209 static INLINE struct brw_reg high_words( struct brw_reg reg )
1210 {
1211 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1212 0, 8, 2 );
1213 }
1214
1215 static INLINE struct brw_reg low_words( struct brw_reg reg )
1216 {
1217 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1218 }
1219
1220 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1221 {
1222 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1223 }
1224
1225 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1226 {
1227 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1228 0, 16, 2 );
1229 }
1230
1231 /* One-, two- and three-dimensional Perlin noise, similar to the description
1232 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
1233 static void noise1_sub( struct brw_wm_compile *c ) {
1234
1235 struct brw_compile *p = &c->func;
1236 struct brw_reg param,
1237 x0, x1, /* gradients at each end */
1238 t, tmp[ 2 ], /* float temporaries */
1239 itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1240 int i;
1241 int mark = mark_tmps( c );
1242
1243 x0 = alloc_tmp( c );
1244 x1 = alloc_tmp( c );
1245 t = alloc_tmp( c );
1246 tmp[ 0 ] = alloc_tmp( c );
1247 tmp[ 1 ] = alloc_tmp( c );
1248 itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1249 itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1250 itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1251 itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1252 itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1253
1254 param = lookup_tmp( c, mark - 2 );
1255
1256 brw_set_access_mode( p, BRW_ALIGN_1 );
1257
1258 brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1259
1260 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1261 be hashed. Also compute the remainder (offset within the unit
1262 length), interleaved to reduce register dependency penalties. */
1263 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1264 brw_FRC( p, param, param );
1265 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1266 brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1267 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1268
1269 /* We're now ready to perform the hashing. The two hashes are
1270 interleaved for performance. The hash function used is
1271 designed to rapidly achieve avalanche and require only 32x16
1272 bit multiplication, and 16-bit swizzles (which we get for
1273 free). We can't use immediate operands in the multiplies,
1274 because immediates are permitted only in src1 and the 16-bit
1275 factor is permitted only in src0. */
1276 for( i = 0; i < 2; i++ )
1277 brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1278 for( i = 0; i < 2; i++ )
1279 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1280 high_words( itmp[ i ] ) );
1281 for( i = 0; i < 2; i++ )
1282 brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1283 for( i = 0; i < 2; i++ )
1284 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1285 high_words( itmp[ i ] ) );
1286 for( i = 0; i < 2; i++ )
1287 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1288 for( i = 0; i < 2; i++ )
1289 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1290 high_words( itmp[ i ] ) );
1291
1292 /* Now we want to initialise the two gradients based on the
1293 hashes. Format conversion from signed integer to float leaves
1294 everything scaled too high by a factor of pow( 2, 31 ), but
1295 we correct for that right at the end. */
1296 brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1297 brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1298 brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1299
1300 brw_MUL( p, x0, x0, param );
1301 brw_MUL( p, x1, x1, t );
1302
1303 /* We interpolate between the gradients using the polynomial
1304 6t^5 - 15t^4 + 10t^3 (Perlin). */
1305 brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1306 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1307 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1308 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1309 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1310 brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1311 pipeline */
1312 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1313 brw_MUL( p, param, tmp[ 0 ], param );
1314 brw_MUL( p, x1, x1, param );
1315 brw_ADD( p, x0, x0, x1 );
1316 /* scale by pow( 2, -30 ), to compensate for the format conversion
1317 above and an extra factor of 2 so that a single gradient covers
1318 the [-1,1] range */
1319 brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1320
1321 release_tmps( c, mark );
1322 }
1323
1324 static void emit_noise1( struct brw_wm_compile *c,
1325 struct prog_instruction *inst )
1326 {
1327 struct brw_compile *p = &c->func;
1328 struct brw_reg src, param, dst;
1329 GLuint mask = inst->DstReg.WriteMask;
1330 int i;
1331 int mark = mark_tmps( c );
1332
1333 assert( mark == 0 );
1334
1335 src = get_src_reg( c, inst->SrcReg, 0, 1 );
1336
1337 param = alloc_tmp( c );
1338
1339 brw_MOV( p, param, src );
1340
1341 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1342
1343 /* Fill in the result: */
1344 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1345 for (i = 0 ; i < 4; i++) {
1346 if (mask & (1<<i)) {
1347 dst = get_dst_reg(c, inst, i, 1);
1348 brw_MOV( p, dst, param );
1349 }
1350 }
1351 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1352 brw_set_saturate( p, 0 );
1353
1354 release_tmps( c, mark );
1355 }
1356
1357 static void noise2_sub( struct brw_wm_compile *c ) {
1358
1359 struct brw_compile *p = &c->func;
1360 struct brw_reg param0, param1,
1361 x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1362 t, tmp[ 4 ], /* float temporaries */
1363 itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1364 int i;
1365 int mark = mark_tmps( c );
1366
1367 x0y0 = alloc_tmp( c );
1368 x0y1 = alloc_tmp( c );
1369 x1y0 = alloc_tmp( c );
1370 x1y1 = alloc_tmp( c );
1371 t = alloc_tmp( c );
1372 for( i = 0; i < 4; i++ ) {
1373 tmp[ i ] = alloc_tmp( c );
1374 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1375 }
1376 itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1377 itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1378 itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1379
1380 param0 = lookup_tmp( c, mark - 3 );
1381 param1 = lookup_tmp( c, mark - 2 );
1382
1383 brw_set_access_mode( p, BRW_ALIGN_1 );
1384
1385 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1386 be hashed. Also compute the remainders (offsets within the unit
1387 square), interleaved to reduce register dependency penalties. */
1388 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1389 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1390 brw_FRC( p, param0, param0 );
1391 brw_FRC( p, param1, param1 );
1392 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1393 brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1394 low_words( itmp[ 1 ] ) );
1395 brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1396 brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1397 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1398 brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1399 brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1400
1401 /* We're now ready to perform the hashing. The four hashes are
1402 interleaved for performance. The hash function used is
1403 designed to rapidly achieve avalanche and require only 32x16
1404 bit multiplication, and 16-bit swizzles (which we get for
1405 free). We can't use immediate operands in the multiplies,
1406 because immediates are permitted only in src1 and the 16-bit
1407 factor is permitted only in src0. */
1408 for( i = 0; i < 4; i++ )
1409 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1410 for( i = 0; i < 4; i++ )
1411 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1412 high_words( itmp[ i ] ) );
1413 for( i = 0; i < 4; i++ )
1414 brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1415 for( i = 0; i < 4; i++ )
1416 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1417 high_words( itmp[ i ] ) );
1418 for( i = 0; i < 4; i++ )
1419 brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1420 for( i = 0; i < 4; i++ )
1421 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1422 high_words( itmp[ i ] ) );
1423
1424 /* Now we want to initialise the four gradients based on the
1425 hashes. Format conversion from signed integer to float leaves
1426 everything scaled too high by a factor of pow( 2, 15 ), but
1427 we correct for that right at the end. */
1428 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1429 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1430 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1431 brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1432 brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1433
1434 brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1435 brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1436 brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1437 brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1438
1439 brw_MUL( p, x1y0, x1y0, t );
1440 brw_MUL( p, x1y1, x1y1, t );
1441 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1442 brw_MUL( p, x0y0, x0y0, param0 );
1443 brw_MUL( p, x0y1, x0y1, param0 );
1444
1445 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1446 brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1447 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1448 brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1449
1450 brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1451 brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1452 brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1453 brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1454
1455 /* We interpolate between the gradients using the polynomial
1456 6t^5 - 15t^4 + 10t^3 (Perlin). */
1457 brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1458 brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1459 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1460 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1461 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1462 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1463 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1464 pipeline */
1465 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1466 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1467 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1468 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1469 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1470 pipeline */
1471 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1472 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1473 brw_MUL( p, param0, tmp[ 0 ], param0 );
1474 brw_MUL( p, param1, tmp[ 1 ], param1 );
1475
1476 /* Here we interpolate in the y dimension... */
1477 brw_MUL( p, x0y1, x0y1, param1 );
1478 brw_MUL( p, x1y1, x1y1, param1 );
1479 brw_ADD( p, x0y0, x0y0, x0y1 );
1480 brw_ADD( p, x1y0, x1y0, x1y1 );
1481
1482 /* And now in x. There are horrible register dependencies here,
1483 but we have nothing else to do. */
1484 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1485 brw_MUL( p, x1y0, x1y0, param0 );
1486 brw_ADD( p, x0y0, x0y0, x1y0 );
1487
1488 /* scale by pow( 2, -15 ), as described above */
1489 brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1490
1491 release_tmps( c, mark );
1492 }
1493
1494 static void emit_noise2( struct brw_wm_compile *c,
1495 struct prog_instruction *inst )
1496 {
1497 struct brw_compile *p = &c->func;
1498 struct brw_reg src0, src1, param0, param1, dst;
1499 GLuint mask = inst->DstReg.WriteMask;
1500 int i;
1501 int mark = mark_tmps( c );
1502
1503 assert( mark == 0 );
1504
1505 src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
1506 src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
1507
1508 param0 = alloc_tmp( c );
1509 param1 = alloc_tmp( c );
1510
1511 brw_MOV( p, param0, src0 );
1512 brw_MOV( p, param1, src1 );
1513
1514 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1515
1516 /* Fill in the result: */
1517 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1518 for (i = 0 ; i < 4; i++) {
1519 if (mask & (1<<i)) {
1520 dst = get_dst_reg(c, inst, i, 1);
1521 brw_MOV( p, dst, param0 );
1522 }
1523 }
1524 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1525 brw_set_saturate( p, 0 );
1526
1527 release_tmps( c, mark );
1528 }
1529
1530 /**
1531 * The three-dimensional case is much like the one- and two- versions above,
1532 * but since the number of corners is rapidly growing we now pack 16 16-bit
1533 * hashes into each register to extract more parallelism from the EUs.
1534 */
1535 static void noise3_sub( struct brw_wm_compile *c ) {
1536
1537 struct brw_compile *p = &c->func;
1538 struct brw_reg param0, param1, param2,
1539 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1540 xi, yi, zi, /* interpolation coefficients */
1541 t, tmp[ 8 ], /* float temporaries */
1542 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1543 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1544 int i;
1545 int mark = mark_tmps( c );
1546
1547 x0y0 = alloc_tmp( c );
1548 x0y1 = alloc_tmp( c );
1549 x1y0 = alloc_tmp( c );
1550 x1y1 = alloc_tmp( c );
1551 xi = alloc_tmp( c );
1552 yi = alloc_tmp( c );
1553 zi = alloc_tmp( c );
1554 t = alloc_tmp( c );
1555 for( i = 0; i < 8; i++ ) {
1556 tmp[ i ] = alloc_tmp( c );
1557 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1558 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1559 }
1560
1561 param0 = lookup_tmp( c, mark - 4 );
1562 param1 = lookup_tmp( c, mark - 3 );
1563 param2 = lookup_tmp( c, mark - 2 );
1564
1565 brw_set_access_mode( p, BRW_ALIGN_1 );
1566
1567 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1568 be hashed. Also compute the remainders (offsets within the unit
1569 cube), interleaved to reduce register dependency penalties. */
1570 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1571 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1572 brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1573 brw_FRC( p, param0, param0 );
1574 brw_FRC( p, param1, param1 );
1575 brw_FRC( p, param2, param2 );
1576 /* Since we now have only 16 bits of precision in the hash, we must
1577 be more careful about thorough mixing to maintain entropy as we
1578 squash the input vector into a small scalar. */
1579 brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1580 brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1581 brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1582 brw_imm_uw( 0x9B93 ) );
1583 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1584 brw_imm_uw( 0xBC8F ) );
1585
1586 /* Temporarily disable the execution mask while we work with ExecSize=16
1587 channels (the mask is set for ExecSize=8 and is probably incorrect).
1588 Although this might cause execution of unwanted channels, the code
1589 writes only to temporary registers and has no side effects, so
1590 disabling the mask is harmless. */
1591 brw_push_insn_state( p );
1592 brw_set_mask_control( p, BRW_MASK_DISABLE );
1593 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1594 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1595 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1596
1597 /* We're now ready to perform the hashing. The eight hashes are
1598 interleaved for performance. The hash function used is
1599 designed to rapidly achieve avalanche and require only 16x16
1600 bit multiplication, and 8-bit swizzles (which we get for
1601 free). */
1602 for( i = 0; i < 4; i++ )
1603 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1604 for( i = 0; i < 4; i++ )
1605 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1606 odd_bytes( wtmp[ i ] ) );
1607 for( i = 0; i < 4; i++ )
1608 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1609 for( i = 0; i < 4; i++ )
1610 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1611 odd_bytes( wtmp[ i ] ) );
1612 brw_pop_insn_state( p );
1613
1614 /* Now we want to initialise the four rear gradients based on the
1615 hashes. Format conversion from signed integer to float leaves
1616 everything scaled too high by a factor of pow( 2, 15 ), but
1617 we correct for that right at the end. */
1618 /* x component */
1619 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1620 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1621 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1622 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1623 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1624
1625 brw_push_insn_state( p );
1626 brw_set_mask_control( p, BRW_MASK_DISABLE );
1627 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1628 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1629 brw_pop_insn_state( p );
1630
1631 brw_MUL( p, x1y0, x1y0, t );
1632 brw_MUL( p, x1y1, x1y1, t );
1633 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1634 brw_MUL( p, x0y0, x0y0, param0 );
1635 brw_MUL( p, x0y1, x0y1, param0 );
1636
1637 /* y component */
1638 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1639 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1640 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1641 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1642
1643 brw_push_insn_state( p );
1644 brw_set_mask_control( p, BRW_MASK_DISABLE );
1645 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1646 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1647 brw_pop_insn_state( p );
1648
1649 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1650 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1651 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1652 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1653 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1654
1655 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1656 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1657 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1658 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1659
1660 /* z component */
1661 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1662 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1663 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1664 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1665
1666 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1667 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1668 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1669 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1670
1671 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1672 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1673 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1674 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1675
1676 /* We interpolate between the gradients using the polynomial
1677 6t^5 - 15t^4 + 10t^3 (Perlin). */
1678 brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
1679 brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
1680 brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
1681 brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
1682 brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
1683 brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
1684 brw_MUL( p, xi, xi, param0 );
1685 brw_MUL( p, yi, yi, param1 );
1686 brw_MUL( p, zi, zi, param2 );
1687 brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
1688 brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
1689 brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
1690 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
1691 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
1692 brw_MUL( p, xi, xi, param0 );
1693 brw_MUL( p, yi, yi, param1 );
1694 brw_MUL( p, zi, zi, param2 );
1695 brw_MUL( p, xi, xi, param0 );
1696 brw_MUL( p, yi, yi, param1 );
1697 brw_MUL( p, zi, zi, param2 );
1698 brw_MUL( p, xi, xi, param0 );
1699 brw_MUL( p, yi, yi, param1 );
1700 brw_MUL( p, zi, zi, param2 );
1701
1702 /* Here we interpolate in the y dimension... */
1703 brw_MUL( p, x0y1, x0y1, yi );
1704 brw_MUL( p, x1y1, x1y1, yi );
1705 brw_ADD( p, x0y0, x0y0, x0y1 );
1706 brw_ADD( p, x1y0, x1y0, x1y1 );
1707
1708 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1709 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1710 brw_MUL( p, x1y0, x1y0, xi );
1711 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
1712
1713 /* Now do the same thing for the front four gradients... */
1714 /* x component */
1715 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
1716 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
1717 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
1718 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
1719
1720 brw_push_insn_state( p );
1721 brw_set_mask_control( p, BRW_MASK_DISABLE );
1722 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1723 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1724 brw_pop_insn_state( p );
1725
1726 brw_MUL( p, x1y0, x1y0, t );
1727 brw_MUL( p, x1y1, x1y1, t );
1728 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1729 brw_MUL( p, x0y0, x0y0, param0 );
1730 brw_MUL( p, x0y1, x0y1, param0 );
1731
1732 /* y component */
1733 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1734 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1735 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1736 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1737
1738 brw_push_insn_state( p );
1739 brw_set_mask_control( p, BRW_MASK_DISABLE );
1740 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1741 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1742 brw_pop_insn_state( p );
1743
1744 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1745 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1746 brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
1747 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1748 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1749
1750 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1751 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1752 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1753 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1754
1755 /* z component */
1756 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1757 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1758 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1759 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1760
1761 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1762 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1763 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1764 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1765
1766 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1767 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1768 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1769 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1770
1771 /* The interpolation coefficients are still around from last time, so
1772 again interpolate in the y dimension... */
1773 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1774 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1775 brw_MUL( p, x0y1, x0y1, yi );
1776 brw_MUL( p, x1y1, x1y1, yi );
1777 brw_ADD( p, x0y0, x0y0, x0y1 );
1778 brw_ADD( p, x1y0, x1y0, x1y1 );
1779
1780 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
1781 time put the front face in tmp[ 1 ] and we're nearly there... */
1782 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1783 brw_MUL( p, x1y0, x1y0, xi );
1784 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
1785
1786 /* The final interpolation, in the z dimension: */
1787 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
1788 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
1789 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
1790
1791 /* scale by pow( 2, -15 ), as described above */
1792 brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
1793
1794 release_tmps( c, mark );
1795 }
1796
1797 static void emit_noise3( struct brw_wm_compile *c,
1798 struct prog_instruction *inst )
1799 {
1800 struct brw_compile *p = &c->func;
1801 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1802 GLuint mask = inst->DstReg.WriteMask;
1803 int i;
1804 int mark = mark_tmps( c );
1805
1806 assert( mark == 0 );
1807
1808 src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
1809 src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
1810 src2 = get_src_reg( c, inst->SrcReg, 2, 1 );
1811
1812 param0 = alloc_tmp( c );
1813 param1 = alloc_tmp( c );
1814 param2 = alloc_tmp( c );
1815
1816 brw_MOV( p, param0, src0 );
1817 brw_MOV( p, param1, src1 );
1818 brw_MOV( p, param2, src2 );
1819
1820 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1821
1822 /* Fill in the result: */
1823 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1824 for (i = 0 ; i < 4; i++) {
1825 if (mask & (1<<i)) {
1826 dst = get_dst_reg(c, inst, i, 1);
1827 brw_MOV( p, dst, param0 );
1828 }
1829 }
1830 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1831 brw_set_saturate( p, 0 );
1832
1833 release_tmps( c, mark );
1834 }
1835
1836 /**
1837 * For the four-dimensional case, the little micro-optimisation benefits
1838 * we obtain by unrolling all the loops aren't worth the massive bloat it
1839 * now causes. Instead, we loop twice around performing a similar operation
1840 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
1841 * code to glue it all together.
1842 */
1843 static void noise4_sub( struct brw_wm_compile *c )
1844 {
1845 struct brw_compile *p = &c->func;
1846 struct brw_reg param[ 4 ],
1847 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1848 w0, /* noise for the w=0 cube */
1849 floors[ 2 ], /* integer coordinates of base corner of hypercube */
1850 interp[ 4 ], /* interpolation coefficients */
1851 t, tmp[ 8 ], /* float temporaries */
1852 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1853 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1854 int i, j;
1855 int mark = mark_tmps( c );
1856 GLuint loop, origin;
1857
1858 x0y0 = alloc_tmp( c );
1859 x0y1 = alloc_tmp( c );
1860 x1y0 = alloc_tmp( c );
1861 x1y1 = alloc_tmp( c );
1862 t = alloc_tmp( c );
1863 w0 = alloc_tmp( c );
1864 floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1865 floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1866
1867 for( i = 0; i < 4; i++ ) {
1868 param[ i ] = lookup_tmp( c, mark - 5 + i );
1869 interp[ i ] = alloc_tmp( c );
1870 }
1871
1872 for( i = 0; i < 8; i++ ) {
1873 tmp[ i ] = alloc_tmp( c );
1874 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1875 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1876 }
1877
1878 brw_set_access_mode( p, BRW_ALIGN_1 );
1879
1880 /* We only want 16 bits of precision from the integral part of each
1881 co-ordinate, but unfortunately the RNDD semantics would saturate
1882 at 16 bits if we performed the operation directly to a 16-bit
1883 destination. Therefore, we round to 32-bit temporaries where
1884 appropriate, and then store only the lower 16 bits. */
1885 brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
1886 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
1887 brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
1888 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
1889 brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
1890 brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
1891
1892 /* Modify the flag register here, because the side effect is useful
1893 later (see below). We know for certain that all flags will be
1894 cleared, since the FRC instruction cannot possibly generate
1895 negative results. Even for exceptional inputs (infinities, denormals,
1896 NaNs), the architecture guarantees that the L conditional is false. */
1897 brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
1898 brw_FRC( p, param[ 0 ], param[ 0 ] );
1899 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1900 for( i = 1; i < 4; i++ )
1901 brw_FRC( p, param[ i ], param[ i ] );
1902
1903 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
1904 of all. */
1905 for( i = 0; i < 4; i++ )
1906 brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
1907 for( i = 0; i < 4; i++ )
1908 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
1909 for( i = 0; i < 4; i++ )
1910 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1911 for( i = 0; i < 4; i++ )
1912 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
1913 for( j = 0; j < 3; j++ )
1914 for( i = 0; i < 4; i++ )
1915 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1916
1917 /* Mark the current address, as it will be a jump destination. The
1918 following code will be executed twice: first, with the flag
1919 register clear indicating the w=0 case, and second with flags
1920 set for w=1. */
1921 loop = p->nr_insn;
1922
1923 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1924 be hashed. Since we have only 16 bits of precision in the hash, we
1925 must be careful about thorough mixing to maintain entropy as we
1926 squash the input vector into a small scalar. */
1927 brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
1928 brw_imm_uw( 0xBC8F ) );
1929 brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
1930 brw_imm_uw( 0xD0BD ) );
1931 brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
1932 brw_imm_uw( 0x9B93 ) );
1933 brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
1934 brw_imm_uw( 0xA359 ) );
1935 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1936 brw_imm_uw( 0xBC8F ) );
1937
1938 /* Temporarily disable the execution mask while we work with ExecSize=16
1939 channels (the mask is set for ExecSize=8 and is probably incorrect).
1940 Although this might cause execution of unwanted channels, the code
1941 writes only to temporary registers and has no side effects, so
1942 disabling the mask is harmless. */
1943 brw_push_insn_state( p );
1944 brw_set_mask_control( p, BRW_MASK_DISABLE );
1945 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1946 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1947 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1948
1949 /* We're now ready to perform the hashing. The eight hashes are
1950 interleaved for performance. The hash function used is
1951 designed to rapidly achieve avalanche and require only 16x16
1952 bit multiplication, and 8-bit swizzles (which we get for
1953 free). */
1954 for( i = 0; i < 4; i++ )
1955 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1956 for( i = 0; i < 4; i++ )
1957 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1958 odd_bytes( wtmp[ i ] ) );
1959 for( i = 0; i < 4; i++ )
1960 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1961 for( i = 0; i < 4; i++ )
1962 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1963 odd_bytes( wtmp[ i ] ) );
1964 brw_pop_insn_state( p );
1965
1966 /* Now we want to initialise the four rear gradients based on the
1967 hashes. Format conversion from signed integer to float leaves
1968 everything scaled too high by a factor of pow( 2, 15 ), but
1969 we correct for that right at the end. */
1970 /* x component */
1971 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
1972 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1973 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1974 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1975 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1976
1977 brw_push_insn_state( p );
1978 brw_set_mask_control( p, BRW_MASK_DISABLE );
1979 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1980 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1981 brw_pop_insn_state( p );
1982
1983 brw_MUL( p, x1y0, x1y0, t );
1984 brw_MUL( p, x1y1, x1y1, t );
1985 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
1986 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
1987 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
1988
1989 /* y component */
1990 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1991 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1992 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1993 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1994
1995 brw_push_insn_state( p );
1996 brw_set_mask_control( p, BRW_MASK_DISABLE );
1997 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1998 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1999 brw_pop_insn_state( p );
2000
2001 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2002 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2003 /* prepare t for the w component (used below): w the first time through
2004 the loop; w - 1 the second time) */
2005 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2006 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2007 p->current->header.predicate_inverse = 1;
2008 brw_MOV( p, t, param[ 3 ] );
2009 p->current->header.predicate_inverse = 0;
2010 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2011 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2012 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2013
2014 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2015 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2016 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2017 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2018
2019 /* z component */
2020 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2021 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2022 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2023 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2024
2025 brw_push_insn_state( p );
2026 brw_set_mask_control( p, BRW_MASK_DISABLE );
2027 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
2028 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
2029 brw_pop_insn_state( p );
2030
2031 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
2032 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
2033 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
2034 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
2035
2036 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2037 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2038 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2039 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2040
2041 /* w component */
2042 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
2043 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
2044 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
2045 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
2046
2047 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2048 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2049 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2050 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2051 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
2052
2053 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2054 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2055 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2056 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2057
2058 /* Here we interpolate in the y dimension... */
2059 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2060 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2061 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2062 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2063 brw_ADD( p, x0y0, x0y0, x0y1 );
2064 brw_ADD( p, x1y0, x1y0, x1y1 );
2065
2066 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
2067 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2068 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2069 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
2070
2071 /* Now do the same thing for the front four gradients... */
2072 /* x component */
2073 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
2074 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
2075 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
2076 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
2077
2078 brw_push_insn_state( p );
2079 brw_set_mask_control( p, BRW_MASK_DISABLE );
2080 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2081 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2082 brw_pop_insn_state( p );
2083
2084 brw_MUL( p, x1y0, x1y0, t );
2085 brw_MUL( p, x1y1, x1y1, t );
2086 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
2087 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
2088 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
2089
2090 /* y component */
2091 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2092 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2093 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2094 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2095
2096 brw_push_insn_state( p );
2097 brw_set_mask_control( p, BRW_MASK_DISABLE );
2098 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2099 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2100 brw_pop_insn_state( p );
2101
2102 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2103 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2104 brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
2105 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
2106 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
2107
2108 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2109 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2110 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2111 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2112
2113 /* z component */
2114 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2115 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2116 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2117 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2118
2119 brw_push_insn_state( p );
2120 brw_set_mask_control( p, BRW_MASK_DISABLE );
2121 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
2122 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
2123 brw_pop_insn_state( p );
2124
2125 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2126 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2127 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2128 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2129 /* prepare t for the w component (used below): w the first time through
2130 the loop; w - 1 the second time) */
2131 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2132 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
2133 p->current->header.predicate_inverse = 1;
2134 brw_MOV( p, t, param[ 3 ] );
2135 p->current->header.predicate_inverse = 0;
2136 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
2137
2138 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2139 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2140 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2141 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2142
2143 /* w component */
2144 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
2145 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
2146 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
2147 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
2148
2149 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
2150 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
2151 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
2152 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
2153
2154 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
2155 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
2156 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2157 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2158
2159 /* Interpolate in the y dimension: */
2160 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2161 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2162 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2163 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2164 brw_ADD( p, x0y0, x0y0, x0y1 );
2165 brw_ADD( p, x1y0, x1y0, x1y1 );
2166
2167 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2168 time put the front face in tmp[ 1 ] and we're nearly there... */
2169 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2170 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2171 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2172
2173 /* Another interpolation, in the z dimension: */
2174 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2175 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2176 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2177
2178 /* Exit the loop if we've computed both cubes... */
2179 origin = p->nr_insn;
2180 brw_push_insn_state( p );
2181 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2182 brw_set_mask_control( p, BRW_MASK_DISABLE );
2183 brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2184 brw_pop_insn_state( p );
2185
2186 /* Save the result for the w=0 case, and increment the w coordinate: */
2187 brw_MOV( p, w0, tmp[ 0 ] );
2188 brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2189 brw_imm_uw( 1 ) );
2190
2191 /* Loop around for the other cube. Explicitly set the flag register
2192 (unfortunately we must spend an extra instruction to do this: we
2193 can't rely on a side effect of the previous MOV or ADD because
2194 conditional modifiers which are normally true might be false in
2195 exceptional circumstances, e.g. given a NaN input; the add to
2196 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
2197 brw_push_insn_state( p );
2198 brw_set_mask_control( p, BRW_MASK_DISABLE );
2199 brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2200 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2201 brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2202 brw_pop_insn_state( p );
2203
2204 /* Patch the previous conditional branch now that we know the
2205 destination address. */
2206 brw_set_src1( p->store + origin,
2207 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2208
2209 /* The very last interpolation. */
2210 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2211 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2212 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2213
2214 /* scale by pow( 2, -15 ), as described above */
2215 brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2216
2217 release_tmps( c, mark );
2218 }
2219
2220 static void emit_noise4( struct brw_wm_compile *c,
2221 struct prog_instruction *inst )
2222 {
2223 struct brw_compile *p = &c->func;
2224 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2225 GLuint mask = inst->DstReg.WriteMask;
2226 int i;
2227 int mark = mark_tmps( c );
2228
2229 assert( mark == 0 );
2230
2231 src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
2232 src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
2233 src2 = get_src_reg( c, inst->SrcReg, 2, 1 );
2234 src3 = get_src_reg( c, inst->SrcReg, 3, 1 );
2235
2236 param0 = alloc_tmp( c );
2237 param1 = alloc_tmp( c );
2238 param2 = alloc_tmp( c );
2239 param3 = alloc_tmp( c );
2240
2241 brw_MOV( p, param0, src0 );
2242 brw_MOV( p, param1, src1 );
2243 brw_MOV( p, param2, src2 );
2244 brw_MOV( p, param3, src3 );
2245
2246 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2247
2248 /* Fill in the result: */
2249 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2250 for (i = 0 ; i < 4; i++) {
2251 if (mask & (1<<i)) {
2252 dst = get_dst_reg(c, inst, i, 1);
2253 brw_MOV( p, dst, param0 );
2254 }
2255 }
2256 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2257 brw_set_saturate( p, 0 );
2258
2259 release_tmps( c, mark );
2260 }
2261
2262 static void emit_wpos_xy(struct brw_wm_compile *c,
2263 struct prog_instruction *inst)
2264 {
2265 struct brw_compile *p = &c->func;
2266 GLuint mask = inst->DstReg.WriteMask;
2267 struct brw_reg src0[2], dst[2];
2268
2269 dst[0] = get_dst_reg(c, inst, 0, 1);
2270 dst[1] = get_dst_reg(c, inst, 1, 1);
2271
2272 src0[0] = get_src_reg(c, &inst->SrcReg[0], 0, 1);
2273 src0[1] = get_src_reg(c, &inst->SrcReg[0], 1, 1);
2274
2275 /* Calculate the pixel offset from window bottom left into destination
2276 * X and Y channels.
2277 */
2278 if (mask & WRITEMASK_X) {
2279 /* X' = X - origin_x */
2280 brw_ADD(p,
2281 dst[0],
2282 retype(src0[0], BRW_REGISTER_TYPE_W),
2283 brw_imm_d(0 - c->key.origin_x));
2284 }
2285
2286 if (mask & WRITEMASK_Y) {
2287 /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2288 brw_ADD(p,
2289 dst[1],
2290 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2291 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2292 }
2293 }
2294
2295 /* TODO
2296 BIAS on SIMD8 not workind yet...
2297 */
2298 static void emit_txb(struct brw_wm_compile *c,
2299 struct prog_instruction *inst)
2300 {
2301 struct brw_compile *p = &c->func;
2302 struct brw_reg dst[4], src[4], payload_reg;
2303 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2304
2305 GLuint i;
2306 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2307 for (i = 0; i < 4; i++)
2308 dst[i] = get_dst_reg(c, inst, i, 1);
2309 for (i = 0; i < 4; i++)
2310 src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
2311
2312 switch (inst->TexSrcTarget) {
2313 case TEXTURE_1D_INDEX:
2314 brw_MOV(p, brw_message_reg(2), src[0]);
2315 brw_MOV(p, brw_message_reg(3), brw_imm_f(0));
2316 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2317 break;
2318 case TEXTURE_2D_INDEX:
2319 case TEXTURE_RECT_INDEX:
2320 brw_MOV(p, brw_message_reg(2), src[0]);
2321 brw_MOV(p, brw_message_reg(3), src[1]);
2322 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2323 break;
2324 default:
2325 brw_MOV(p, brw_message_reg(2), src[0]);
2326 brw_MOV(p, brw_message_reg(3), src[1]);
2327 brw_MOV(p, brw_message_reg(4), src[2]);
2328 break;
2329 }
2330 brw_MOV(p, brw_message_reg(5), src[3]);
2331 brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
2332 brw_SAMPLE(p,
2333 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
2334 1,
2335 retype(payload_reg, BRW_REGISTER_TYPE_UW),
2336 unit + MAX_DRAW_BUFFERS, /* surface */
2337 unit, /* sampler */
2338 inst->DstReg.WriteMask,
2339 BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,
2340 4,
2341 4,
2342 0);
2343 }
2344
2345 static void emit_tex(struct brw_wm_compile *c,
2346 struct prog_instruction *inst)
2347 {
2348 struct brw_compile *p = &c->func;
2349 struct brw_reg dst[4], src[4], payload_reg;
2350 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2351
2352 GLuint msg_len;
2353 GLuint i, nr;
2354 GLuint emit;
2355 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2356
2357 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2358
2359 for (i = 0; i < 4; i++)
2360 dst[i] = get_dst_reg(c, inst, i, 1);
2361 for (i = 0; i < 4; i++)
2362 src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
2363
2364
2365 switch (inst->TexSrcTarget) {
2366 case TEXTURE_1D_INDEX:
2367 emit = WRITEMASK_X;
2368 nr = 1;
2369 break;
2370 case TEXTURE_2D_INDEX:
2371 case TEXTURE_RECT_INDEX:
2372 emit = WRITEMASK_XY;
2373 nr = 2;
2374 break;
2375 default:
2376 emit = WRITEMASK_XYZ;
2377 nr = 3;
2378 break;
2379 }
2380 msg_len = 1;
2381
2382 for (i = 0; i < nr; i++) {
2383 static const GLuint swz[4] = {0,1,2,2};
2384 if (emit & (1<<i))
2385 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2386 else
2387 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2388 msg_len += 1;
2389 }
2390
2391 if (shadow) {
2392 brw_MOV(p, brw_message_reg(5), brw_imm_f(0));
2393 brw_MOV(p, brw_message_reg(6), src[2]);
2394 }
2395
2396 brw_SAMPLE(p,
2397 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
2398 1,
2399 retype(payload_reg, BRW_REGISTER_TYPE_UW),
2400 unit + MAX_DRAW_BUFFERS, /* surface */
2401 unit, /* sampler */
2402 inst->DstReg.WriteMask,
2403 BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,
2404 4,
2405 shadow ? 6 : 4,
2406 0);
2407
2408 if (shadow)
2409 brw_MOV(p, dst[3], brw_imm_f(1.0));
2410 }
2411
2412 /**
2413 * Resolve subroutine calls after code emit is done.
2414 */
2415 static void post_wm_emit( struct brw_wm_compile *c )
2416 {
2417 brw_resolve_cals(&c->func);
2418 }
2419
2420 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2421 {
2422 #define MAX_IFSN 32
2423 #define MAX_LOOP_DEPTH 32
2424 struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
2425 struct brw_instruction *inst0, *inst1;
2426 int i, if_insn = 0, loop_insn = 0;
2427 struct brw_compile *p = &c->func;
2428 struct brw_indirect stack_index = brw_indirect(0, 0);
2429
2430 c->reg_index = 0;
2431 prealloc_reg(c);
2432 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2433 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2434
2435 for (i = 0; i < c->nr_fp_insns; i++) {
2436 struct prog_instruction *inst = &c->prog_instructions[i];
2437
2438 if (inst->CondUpdate)
2439 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2440 else
2441 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2442
2443 switch (inst->Opcode) {
2444 case WM_PIXELXY:
2445 emit_pixel_xy(c, inst);
2446 break;
2447 case WM_DELTAXY:
2448 emit_delta_xy(c, inst);
2449 break;
2450 case WM_PIXELW:
2451 emit_pixel_w(c, inst);
2452 break;
2453 case WM_LINTERP:
2454 emit_linterp(c, inst);
2455 break;
2456 case WM_PINTERP:
2457 emit_pinterp(c, inst);
2458 break;
2459 case WM_CINTERP:
2460 emit_cinterp(c, inst);
2461 break;
2462 case WM_WPOSXY:
2463 emit_wpos_xy(c, inst);
2464 break;
2465 case WM_FB_WRITE:
2466 emit_fb_write(c, inst);
2467 break;
2468 case WM_FRONTFACING:
2469 emit_frontfacing(c, inst);
2470 break;
2471 case OPCODE_ABS:
2472 emit_abs(c, inst);
2473 break;
2474 case OPCODE_ADD:
2475 emit_add(c, inst);
2476 break;
2477 case OPCODE_SUB:
2478 emit_sub(c, inst);
2479 break;
2480 case OPCODE_FRC:
2481 emit_frc(c, inst);
2482 break;
2483 case OPCODE_FLR:
2484 emit_flr(c, inst);
2485 break;
2486 case OPCODE_LRP:
2487 emit_lrp(c, inst);
2488 break;
2489 case OPCODE_TRUNC:
2490 emit_trunc(c, inst);
2491 break;
2492 case OPCODE_MOV:
2493 emit_mov(c, inst);
2494 break;
2495 case OPCODE_DP3:
2496 emit_dp3(c, inst);
2497 break;
2498 case OPCODE_DP4:
2499 emit_dp4(c, inst);
2500 break;
2501 case OPCODE_XPD:
2502 emit_xpd(c, inst);
2503 break;
2504 case OPCODE_DPH:
2505 emit_dph(c, inst);
2506 break;
2507 case OPCODE_RCP:
2508 emit_rcp(c, inst);
2509 break;
2510 case OPCODE_RSQ:
2511 emit_rsq(c, inst);
2512 break;
2513 case OPCODE_SIN:
2514 emit_sin(c, inst);
2515 break;
2516 case OPCODE_COS:
2517 emit_cos(c, inst);
2518 break;
2519 case OPCODE_EX2:
2520 emit_ex2(c, inst);
2521 break;
2522 case OPCODE_LG2:
2523 emit_lg2(c, inst);
2524 break;
2525 case OPCODE_MAX:
2526 emit_max(c, inst);
2527 break;
2528 case OPCODE_MIN:
2529 emit_min(c, inst);
2530 break;
2531 case OPCODE_DDX:
2532 emit_ddx(c, inst);
2533 break;
2534 case OPCODE_DDY:
2535 emit_ddy(c, inst);
2536 break;
2537 case OPCODE_SLT:
2538 emit_slt(c, inst);
2539 break;
2540 case OPCODE_SLE:
2541 emit_sle(c, inst);
2542 break;
2543 case OPCODE_SGT:
2544 emit_sgt(c, inst);
2545 break;
2546 case OPCODE_SGE:
2547 emit_sge(c, inst);
2548 break;
2549 case OPCODE_SEQ:
2550 emit_seq(c, inst);
2551 break;
2552 case OPCODE_SNE:
2553 emit_sne(c, inst);
2554 break;
2555 case OPCODE_MUL:
2556 emit_mul(c, inst);
2557 break;
2558 case OPCODE_POW:
2559 emit_pow(c, inst);
2560 break;
2561 case OPCODE_MAD:
2562 emit_mad(c, inst);
2563 break;
2564 case OPCODE_NOISE1:
2565 emit_noise1(c, inst);
2566 break;
2567 case OPCODE_NOISE2:
2568 emit_noise2(c, inst);
2569 break;
2570 case OPCODE_NOISE3:
2571 emit_noise3(c, inst);
2572 break;
2573 case OPCODE_NOISE4:
2574 emit_noise4(c, inst);
2575 break;
2576 case OPCODE_TEX:
2577 emit_tex(c, inst);
2578 break;
2579 case OPCODE_TXB:
2580 emit_txb(c, inst);
2581 break;
2582 case OPCODE_KIL_NV:
2583 emit_kil(c);
2584 break;
2585 case OPCODE_IF:
2586 assert(if_insn < MAX_IFSN);
2587 if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
2588 break;
2589 case OPCODE_ELSE:
2590 if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
2591 break;
2592 case OPCODE_ENDIF:
2593 assert(if_insn > 0);
2594 brw_ENDIF(p, if_inst[--if_insn]);
2595 break;
2596 case OPCODE_BGNSUB:
2597 brw_save_label(p, inst->Comment, p->nr_insn);
2598 break;
2599 case OPCODE_ENDSUB:
2600 /* no-op */
2601 break;
2602 case OPCODE_CAL:
2603 brw_push_insn_state(p);
2604 brw_set_mask_control(p, BRW_MASK_DISABLE);
2605 brw_set_access_mode(p, BRW_ALIGN_1);
2606 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2607 brw_set_access_mode(p, BRW_ALIGN_16);
2608 brw_ADD(p, get_addr_reg(stack_index),
2609 get_addr_reg(stack_index), brw_imm_d(4));
2610 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2611 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2612 brw_pop_insn_state(p);
2613 break;
2614
2615 case OPCODE_RET:
2616 brw_push_insn_state(p);
2617 brw_set_mask_control(p, BRW_MASK_DISABLE);
2618 brw_ADD(p, get_addr_reg(stack_index),
2619 get_addr_reg(stack_index), brw_imm_d(-4));
2620 brw_set_access_mode(p, BRW_ALIGN_1);
2621 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2622 brw_set_access_mode(p, BRW_ALIGN_16);
2623 brw_pop_insn_state(p);
2624
2625 break;
2626 case OPCODE_BGNLOOP:
2627 loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
2628 break;
2629 case OPCODE_BRK:
2630 brw_BREAK(p);
2631 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2632 break;
2633 case OPCODE_CONT:
2634 brw_CONT(p);
2635 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2636 break;
2637 case OPCODE_ENDLOOP:
2638 loop_insn--;
2639 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
2640 /* patch all the BREAK instructions from
2641 last BEGINLOOP */
2642 while (inst0 > loop_inst[loop_insn]) {
2643 inst0--;
2644 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2645 inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
2646 inst0->bits3.if_else.pop_count = 0;
2647 } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2648 inst0->bits3.if_else.jump_count = inst1 - inst0;
2649 inst0->bits3.if_else.pop_count = 0;
2650 }
2651 }
2652 break;
2653 default:
2654 _mesa_printf("unsupported IR in fragment shader %d\n",
2655 inst->Opcode);
2656 }
2657 if (inst->CondUpdate)
2658 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2659 else
2660 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2661 }
2662 post_wm_emit(c);
2663
2664 if (c->reg_index >= BRW_WM_MAX_GRF) {
2665 _mesa_problem(NULL, "Ran out of registers in brw_wm_emit_glsl()");
2666 /* XXX we need to do some proper error recovery here */
2667 }
2668 }
2669
2670
2671 /**
2672 * Do GPU code generation for shaders that use GLSL features such as
2673 * flow control. Other shaders will be compiled with the
2674 */
2675 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2676 {
2677 if (INTEL_DEBUG & DEBUG_WM) {
2678 _mesa_printf("brw_wm_glsl_emit:\n");
2679 }
2680
2681 /* initial instruction translation/simplification */
2682 brw_wm_pass_fp(c);
2683
2684 /* actual code generation */
2685 brw_wm_emit_glsl(brw, c);
2686
2687 if (INTEL_DEBUG & DEBUG_WM) {
2688 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2689 }
2690
2691 c->prog_data.total_grf = c->reg_index;
2692 c->prog_data.total_scratch = 0;
2693 }