radeon/r200/r300: cleanup some of the renderbuffer code
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "brw_context.h"
4 #include "brw_eu.h"
5 #include "brw_wm.h"
6
/* Identifiers for the reusable instruction sequences handled by
 * invoke_subroutine(); the values are used as indices, starting at 0.
 */
enum _subroutine {
    SUB_NOISE1,
    SUB_NOISE2,
    SUB_NOISE3,
    SUB_NOISE4
};
10
11 /* Only guess, need a flag in gl_fragment_program later */
12 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
13 {
14 int i;
15 for (i = 0; i < fp->Base.NumInstructions; i++) {
16 struct prog_instruction *inst = &fp->Base.Instructions[i];
17 switch (inst->Opcode) {
18 case OPCODE_IF:
19 case OPCODE_TRUNC:
20 case OPCODE_ENDIF:
21 case OPCODE_CAL:
22 case OPCODE_BRK:
23 case OPCODE_RET:
24 case OPCODE_DDX:
25 case OPCODE_DDY:
26 case OPCODE_NOISE1:
27 case OPCODE_NOISE2:
28 case OPCODE_NOISE3:
29 case OPCODE_NOISE4:
30 case OPCODE_BGNLOOP:
31 return GL_TRUE;
32 default:
33 break;
34 }
35 }
36 return GL_FALSE;
37 }
38
39 static void set_reg(struct brw_wm_compile *c, int file, int index,
40 int component, struct brw_reg reg)
41 {
42 c->wm_regs[file][index][component].reg = reg;
43 c->wm_regs[file][index][component].inited = GL_TRUE;
44 }
45
46 static int get_scalar_dst_index(struct prog_instruction *inst)
47 {
48 int i;
49 for (i = 0; i < 4; i++)
50 if (inst->DstReg.WriteMask & (1<<i))
51 break;
52 return i;
53 }
54
55 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
56 {
57 struct brw_reg reg;
58 if(c->tmp_index == c->tmp_max)
59 c->tmp_regs[ c->tmp_max++ ] = c->reg_index++;
60
61 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
62 return reg;
63 }
64
65 static int mark_tmps(struct brw_wm_compile *c)
66 {
67 return c->tmp_index;
68 }
69
70 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
71 {
72 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
73 }
74
75 static void release_tmps(struct brw_wm_compile *c, int mark)
76 {
77 c->tmp_index = mark;
78 }
79
80 static struct brw_reg
81 get_reg(struct brw_wm_compile *c, int file, int index, int component, int nr, GLuint neg, GLuint abs)
82 {
83 struct brw_reg reg;
84 switch (file) {
85 case PROGRAM_STATE_VAR:
86 case PROGRAM_CONSTANT:
87 case PROGRAM_UNIFORM:
88 file = PROGRAM_STATE_VAR;
89 break;
90 case PROGRAM_UNDEFINED:
91 return brw_null_reg();
92 default:
93 break;
94 }
95
96 if(c->wm_regs[file][index][component].inited)
97 reg = c->wm_regs[file][index][component].reg;
98 else
99 reg = brw_vec8_grf(c->reg_index, 0);
100
101 if(!c->wm_regs[file][index][component].inited) {
102 set_reg(c, file, index, component, reg);
103 c->reg_index++;
104 }
105
106 if (neg & (1<< component)) {
107 reg = negate(reg);
108 }
109 if (abs)
110 reg = brw_abs(reg);
111 return reg;
112 }
113
114 static void prealloc_reg(struct brw_wm_compile *c)
115 {
116 int i, j;
117 struct brw_reg reg;
118 int nr_interp_regs = 0;
119 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
120
121 for (i = 0; i < 4; i++) {
122 reg = (i < c->key.nr_depth_regs)
123 ? brw_vec8_grf(i*2, 0) : brw_vec8_grf(0, 0);
124 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
125 }
126 c->reg_index += 2*c->key.nr_depth_regs;
127 {
128 int nr_params = c->fp->program.Base.Parameters->NumParameters;
129 struct gl_program_parameter_list *plist =
130 c->fp->program.Base.Parameters;
131 int index = 0;
132 c->prog_data.nr_params = 4*nr_params;
133 for (i = 0; i < nr_params; i++) {
134 for (j = 0; j < 4; j++, index++) {
135 reg = brw_vec1_grf(c->reg_index + index/8,
136 index%8);
137 c->prog_data.param[index] =
138 &plist->ParameterValues[i][j];
139 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
140 }
141 }
142 c->nr_creg = 2*((4*nr_params+15)/16);
143 c->reg_index += c->nr_creg;
144 }
145 for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
146 if (inputs & (1<<i)) {
147 nr_interp_regs++;
148 reg = brw_vec8_grf(c->reg_index, 0);
149 for (j = 0; j < 4; j++)
150 set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
151 c->reg_index += 2;
152
153 }
154 }
155 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
156 c->prog_data.urb_read_length = nr_interp_regs * 2;
157 c->prog_data.curb_read_length = c->nr_creg;
158 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
159 c->reg_index++;
160 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
161 c->reg_index += 2;
162 }
163
164 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
165 struct prog_instruction *inst, int component, int nr)
166 {
167 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
168 0, 0);
169 }
170
171 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
172 struct prog_src_register *src, int index, int nr)
173 {
174 int component = GET_SWZ(src->Swizzle, index);
175 return get_reg(c, src->File, src->Index, component, nr,
176 src->NegateBase, src->Abs);
177 }
178
179 /* Subroutines are minimal support for reusable instruction sequences.
180 They are implemented as simply as possible to minimise overhead: there
181 is no explicit support for communication between the caller and callee
182 other than saving the return address in a temporary register, nor is
183 there any automatic local storage. This implies that great care is
184 required before attempting reentrancy or any kind of nested
185 subroutine invocations. */
186 static void invoke_subroutine( struct brw_wm_compile *c,
187 enum _subroutine subroutine,
188 void (*emit)( struct brw_wm_compile * ) )
189 {
190 struct brw_compile *p = &c->func;
191
192 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
193
194 if( c->subroutines[ subroutine ] ) {
195 /* subroutine previously emitted: reuse existing instructions */
196
197 int mark = mark_tmps( c );
198 struct brw_reg return_address = retype( alloc_tmp( c ),
199 BRW_REGISTER_TYPE_UD );
200 int here = p->nr_insn;
201
202 brw_push_insn_state(p);
203 brw_set_mask_control(p, BRW_MASK_DISABLE);
204 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
205
206 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
207 brw_imm_d( ( c->subroutines[ subroutine ] -
208 here - 1 ) << 4 ) );
209 brw_pop_insn_state(p);
210
211 release_tmps( c, mark );
212 } else {
213 /* previously unused subroutine: emit, and mark for later reuse */
214
215 int mark = mark_tmps( c );
216 struct brw_reg return_address = retype( alloc_tmp( c ),
217 BRW_REGISTER_TYPE_UD );
218 struct brw_instruction *calc;
219 int base = p->nr_insn;
220
221 brw_push_insn_state(p);
222 brw_set_mask_control(p, BRW_MASK_DISABLE);
223 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
224 brw_pop_insn_state(p);
225
226 c->subroutines[ subroutine ] = p->nr_insn;
227
228 emit( c );
229
230 brw_push_insn_state(p);
231 brw_set_mask_control(p, BRW_MASK_DISABLE);
232 brw_MOV( p, brw_ip_reg(), return_address );
233 brw_pop_insn_state(p);
234
235 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
236
237 release_tmps( c, mark );
238 }
239 }
240
241 static void emit_abs( struct brw_wm_compile *c,
242 struct prog_instruction *inst)
243 {
244 int i;
245 struct brw_compile *p = &c->func;
246 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
247 for (i = 0; i < 4; i++) {
248 if (inst->DstReg.WriteMask & (1<<i)) {
249 struct brw_reg src, dst;
250 dst = get_dst_reg(c, inst, i, 1);
251 src = get_src_reg(c, &inst->SrcReg[0], i, 1);
252 brw_MOV(p, dst, brw_abs(src));
253 }
254 }
255 brw_set_saturate(p, 0);
256 }
257
258 static void emit_trunc( struct brw_wm_compile *c,
259 struct prog_instruction *inst)
260 {
261 int i;
262 struct brw_compile *p = &c->func;
263 GLuint mask = inst->DstReg.WriteMask;
264 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
265 for (i = 0; i < 4; i++) {
266 if (mask & (1<<i)) {
267 struct brw_reg src, dst;
268 dst = get_dst_reg(c, inst, i, 1) ;
269 src = get_src_reg(c, &inst->SrcReg[0], i, 1);
270 brw_RNDZ(p, dst, src);
271 }
272 }
273 brw_set_saturate(p, 0);
274 }
275
276 static void emit_mov( struct brw_wm_compile *c,
277 struct prog_instruction *inst)
278 {
279 int i;
280 struct brw_compile *p = &c->func;
281 GLuint mask = inst->DstReg.WriteMask;
282 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
283 for (i = 0; i < 4; i++) {
284 if (mask & (1<<i)) {
285 struct brw_reg src, dst;
286 dst = get_dst_reg(c, inst, i, 1);
287 src = get_src_reg(c, &inst->SrcReg[0], i, 1);
288 brw_MOV(p, dst, src);
289 }
290 }
291 brw_set_saturate(p, 0);
292 }
293
294 static void emit_pixel_xy(struct brw_wm_compile *c,
295 struct prog_instruction *inst)
296 {
297 struct brw_reg r1 = brw_vec1_grf(1, 0);
298 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
299
300 struct brw_reg dst0, dst1;
301 struct brw_compile *p = &c->func;
302 GLuint mask = inst->DstReg.WriteMask;
303
304 dst0 = get_dst_reg(c, inst, 0, 1);
305 dst1 = get_dst_reg(c, inst, 1, 1);
306 /* Calculate pixel centers by adding 1 or 0 to each of the
307 * micro-tile coordinates passed in r1.
308 */
309 if (mask & WRITEMASK_X) {
310 brw_ADD(p,
311 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
312 stride(suboffset(r1_uw, 4), 2, 4, 0),
313 brw_imm_v(0x10101010));
314 }
315
316 if (mask & WRITEMASK_Y) {
317 brw_ADD(p,
318 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
319 stride(suboffset(r1_uw, 5), 2, 4, 0),
320 brw_imm_v(0x11001100));
321 }
322
323 }
324
325 static void emit_delta_xy(struct brw_wm_compile *c,
326 struct prog_instruction *inst)
327 {
328 struct brw_reg r1 = brw_vec1_grf(1, 0);
329 struct brw_reg dst0, dst1, src0, src1;
330 struct brw_compile *p = &c->func;
331 GLuint mask = inst->DstReg.WriteMask;
332
333 dst0 = get_dst_reg(c, inst, 0, 1);
334 dst1 = get_dst_reg(c, inst, 1, 1);
335 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
336 src1 = get_src_reg(c, &inst->SrcReg[0], 1, 1);
337 /* Calc delta X,Y by subtracting origin in r1 from the pixel
338 * centers.
339 */
340 if (mask & WRITEMASK_X) {
341 brw_ADD(p,
342 dst0,
343 retype(src0, BRW_REGISTER_TYPE_UW),
344 negate(r1));
345 }
346
347 if (mask & WRITEMASK_Y) {
348 brw_ADD(p,
349 dst1,
350 retype(src1, BRW_REGISTER_TYPE_UW),
351 negate(suboffset(r1,1)));
352
353 }
354
355 }
356
357
358 static void fire_fb_write( struct brw_wm_compile *c,
359 GLuint base_reg,
360 GLuint nr,
361 GLuint target,
362 GLuint eot)
363 {
364 struct brw_compile *p = &c->func;
365 /* Pass through control information:
366 */
367 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
368 {
369 brw_push_insn_state(p);
370 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
371 brw_MOV(p,
372 brw_message_reg(base_reg + 1),
373 brw_vec8_grf(1, 0));
374 brw_pop_insn_state(p);
375 }
376 /* Send framebuffer write message: */
377 brw_fb_WRITE(p,
378 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
379 base_reg,
380 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
381 target,
382 nr,
383 0,
384 eot);
385 }
386
387 static void emit_fb_write(struct brw_wm_compile *c,
388 struct prog_instruction *inst)
389 {
390 struct brw_compile *p = &c->func;
391 int nr = 2;
392 int channel;
393 GLuint target, eot;
394 struct brw_reg src0;
395
396 /* Reserve a space for AA - may not be needed:
397 */
398 if (c->key.aa_dest_stencil_reg)
399 nr += 1;
400 {
401 brw_push_insn_state(p);
402 for (channel = 0; channel < 4; channel++) {
403 src0 = get_src_reg(c, &inst->SrcReg[0], channel, 1);
404 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
405 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
406 brw_MOV(p, brw_message_reg(nr + channel), src0);
407 }
408 /* skip over the regs populated above: */
409 nr += 8;
410 brw_pop_insn_state(p);
411 }
412
413 if (c->key.source_depth_to_render_target)
414 {
415 if (c->key.computes_depth) {
416 src0 = get_src_reg(c, &inst->SrcReg[2], 2, 1);
417 brw_MOV(p, brw_message_reg(nr), src0);
418 } else {
419 src0 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
420 brw_MOV(p, brw_message_reg(nr), src0);
421 }
422
423 nr += 2;
424 }
425 target = inst->Sampler >> 1;
426 eot = inst->Sampler & 1;
427 fire_fb_write(c, 0, nr, target, eot);
428 }
429
430 static void emit_pixel_w( struct brw_wm_compile *c,
431 struct prog_instruction *inst)
432 {
433 struct brw_compile *p = &c->func;
434 GLuint mask = inst->DstReg.WriteMask;
435 if (mask & WRITEMASK_W) {
436 struct brw_reg dst, src0, delta0, delta1;
437 struct brw_reg interp3;
438
439 dst = get_dst_reg(c, inst, 3, 1);
440 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
441 delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
442 delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
443
444 interp3 = brw_vec1_grf(src0.nr+1, 4);
445 /* Calc 1/w - just linterp wpos[3] optimized by putting the
446 * result straight into a message reg.
447 */
448 brw_LINE(p, brw_null_reg(), interp3, delta0);
449 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
450
451 /* Calc w */
452 brw_math_16( p, dst,
453 BRW_MATH_FUNCTION_INV,
454 BRW_MATH_SATURATE_NONE,
455 2, brw_null_reg(),
456 BRW_MATH_PRECISION_FULL);
457 }
458 }
459
460 static void emit_linterp(struct brw_wm_compile *c,
461 struct prog_instruction *inst)
462 {
463 struct brw_compile *p = &c->func;
464 GLuint mask = inst->DstReg.WriteMask;
465 struct brw_reg interp[4];
466 struct brw_reg dst, delta0, delta1;
467 struct brw_reg src0;
468
469 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
470 delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
471 delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
472 GLuint nr = src0.nr;
473 int i;
474
475 interp[0] = brw_vec1_grf(nr, 0);
476 interp[1] = brw_vec1_grf(nr, 4);
477 interp[2] = brw_vec1_grf(nr+1, 0);
478 interp[3] = brw_vec1_grf(nr+1, 4);
479
480 for(i = 0; i < 4; i++ ) {
481 if (mask & (1<<i)) {
482 dst = get_dst_reg(c, inst, i, 1);
483 brw_LINE(p, brw_null_reg(), interp[i], delta0);
484 brw_MAC(p, dst, suboffset(interp[i],1), delta1);
485 }
486 }
487 }
488
489 static void emit_cinterp(struct brw_wm_compile *c,
490 struct prog_instruction *inst)
491 {
492 struct brw_compile *p = &c->func;
493 GLuint mask = inst->DstReg.WriteMask;
494
495 struct brw_reg interp[4];
496 struct brw_reg dst, src0;
497
498 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
499 GLuint nr = src0.nr;
500 int i;
501
502 interp[0] = brw_vec1_grf(nr, 0);
503 interp[1] = brw_vec1_grf(nr, 4);
504 interp[2] = brw_vec1_grf(nr+1, 0);
505 interp[3] = brw_vec1_grf(nr+1, 4);
506
507 for(i = 0; i < 4; i++ ) {
508 if (mask & (1<<i)) {
509 dst = get_dst_reg(c, inst, i, 1);
510 brw_MOV(p, dst, suboffset(interp[i],3));
511 }
512 }
513 }
514
515 static void emit_pinterp(struct brw_wm_compile *c,
516 struct prog_instruction *inst)
517 {
518 struct brw_compile *p = &c->func;
519 GLuint mask = inst->DstReg.WriteMask;
520
521 struct brw_reg interp[4];
522 struct brw_reg dst, delta0, delta1;
523 struct brw_reg src0, w;
524
525 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
526 delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
527 delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
528 w = get_src_reg(c, &inst->SrcReg[2], 3, 1);
529 GLuint nr = src0.nr;
530 int i;
531
532 interp[0] = brw_vec1_grf(nr, 0);
533 interp[1] = brw_vec1_grf(nr, 4);
534 interp[2] = brw_vec1_grf(nr+1, 0);
535 interp[3] = brw_vec1_grf(nr+1, 4);
536
537 for(i = 0; i < 4; i++ ) {
538 if (mask & (1<<i)) {
539 dst = get_dst_reg(c, inst, i, 1);
540 brw_LINE(p, brw_null_reg(), interp[i], delta0);
541 brw_MAC(p, dst, suboffset(interp[i],1),
542 delta1);
543 brw_MUL(p, dst, dst, w);
544 }
545 }
546 }
547
548 static void emit_xpd(struct brw_wm_compile *c,
549 struct prog_instruction *inst)
550 {
551 int i;
552 struct brw_compile *p = &c->func;
553 GLuint mask = inst->DstReg.WriteMask;
554 for (i = 0; i < 4; i++) {
555 GLuint i2 = (i+2)%3;
556 GLuint i1 = (i+1)%3;
557 if (mask & (1<<i)) {
558 struct brw_reg src0, src1, dst;
559 dst = get_dst_reg(c, inst, i, 1);
560 src0 = negate(get_src_reg(c, &inst->SrcReg[0], i2, 1));
561 src1 = get_src_reg(c, &inst->SrcReg[1], i1, 1);
562 brw_MUL(p, brw_null_reg(), src0, src1);
563 src0 = get_src_reg(c, &inst->SrcReg[0], i1, 1);
564 src1 = get_src_reg(c, &inst->SrcReg[1], i2, 1);
565 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
566 brw_MAC(p, dst, src0, src1);
567 brw_set_saturate(p, 0);
568 }
569 }
570 brw_set_saturate(p, 0);
571 }
572
573 static void emit_dp3(struct brw_wm_compile *c,
574 struct prog_instruction *inst)
575 {
576 struct brw_reg src0[3], src1[3], dst;
577 int i;
578 struct brw_compile *p = &c->func;
579 for (i = 0; i < 3; i++) {
580 src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
581 src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
582 }
583
584 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
585 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
586 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
587 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
588 brw_MAC(p, dst, src0[2], src1[2]);
589 brw_set_saturate(p, 0);
590 }
591
592 static void emit_dp4(struct brw_wm_compile *c,
593 struct prog_instruction *inst)
594 {
595 struct brw_reg src0[4], src1[4], dst;
596 int i;
597 struct brw_compile *p = &c->func;
598 for (i = 0; i < 4; i++) {
599 src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
600 src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
601 }
602 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
603 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
604 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
605 brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
606 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
607 brw_MAC(p, dst, src0[3], src1[3]);
608 brw_set_saturate(p, 0);
609 }
610
611 static void emit_dph(struct brw_wm_compile *c,
612 struct prog_instruction *inst)
613 {
614 struct brw_reg src0[4], src1[4], dst;
615 int i;
616 struct brw_compile *p = &c->func;
617 for (i = 0; i < 4; i++) {
618 src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
619 src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
620 }
621 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
622 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
623 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
624 brw_MAC(p, dst, src0[2], src1[2]);
625 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
626 brw_ADD(p, dst, dst, src1[3]);
627 brw_set_saturate(p, 0);
628 }
629
630 static void emit_math1(struct brw_wm_compile *c,
631 struct prog_instruction *inst, GLuint func)
632 {
633 struct brw_compile *p = &c->func;
634 struct brw_reg src0, dst;
635
636 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
637 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
638 brw_MOV(p, brw_message_reg(2), src0);
639 brw_math(p,
640 dst,
641 func,
642 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
643 2,
644 brw_null_reg(),
645 BRW_MATH_DATA_VECTOR,
646 BRW_MATH_PRECISION_FULL);
647 }
648
649 static void emit_rcp(struct brw_wm_compile *c,
650 struct prog_instruction *inst)
651 {
652 emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
653 }
654
655 static void emit_rsq(struct brw_wm_compile *c,
656 struct prog_instruction *inst)
657 {
658 emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
659 }
660
661 static void emit_sin(struct brw_wm_compile *c,
662 struct prog_instruction *inst)
663 {
664 emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
665 }
666
667 static void emit_cos(struct brw_wm_compile *c,
668 struct prog_instruction *inst)
669 {
670 emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
671 }
672
673 static void emit_ex2(struct brw_wm_compile *c,
674 struct prog_instruction *inst)
675 {
676 emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
677 }
678
679 static void emit_lg2(struct brw_wm_compile *c,
680 struct prog_instruction *inst)
681 {
682 emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
683 }
684
685 static void emit_add(struct brw_wm_compile *c,
686 struct prog_instruction *inst)
687 {
688 struct brw_compile *p = &c->func;
689 struct brw_reg src0, src1, dst;
690 GLuint mask = inst->DstReg.WriteMask;
691 int i;
692 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
693 for (i = 0 ; i < 4; i++) {
694 if (mask & (1<<i)) {
695 dst = get_dst_reg(c, inst, i, 1);
696 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
697 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
698 brw_ADD(p, dst, src0, src1);
699 }
700 }
701 brw_set_saturate(p, 0);
702 }
703
704 static void emit_sub(struct brw_wm_compile *c,
705 struct prog_instruction *inst)
706 {
707 struct brw_compile *p = &c->func;
708 struct brw_reg src0, src1, dst;
709 GLuint mask = inst->DstReg.WriteMask;
710 int i;
711 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
712 for (i = 0 ; i < 4; i++) {
713 if (mask & (1<<i)) {
714 dst = get_dst_reg(c, inst, i, 1);
715 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
716 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
717 brw_ADD(p, dst, src0, negate(src1));
718 }
719 }
720 brw_set_saturate(p, 0);
721 }
722
723 static void emit_mul(struct brw_wm_compile *c,
724 struct prog_instruction *inst)
725 {
726 struct brw_compile *p = &c->func;
727 struct brw_reg src0, src1, dst;
728 GLuint mask = inst->DstReg.WriteMask;
729 int i;
730 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
731 for (i = 0 ; i < 4; i++) {
732 if (mask & (1<<i)) {
733 dst = get_dst_reg(c, inst, i, 1);
734 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
735 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
736 brw_MUL(p, dst, src0, src1);
737 }
738 }
739 brw_set_saturate(p, 0);
740 }
741
742 static void emit_frc(struct brw_wm_compile *c,
743 struct prog_instruction *inst)
744 {
745 struct brw_compile *p = &c->func;
746 struct brw_reg src0, dst;
747 GLuint mask = inst->DstReg.WriteMask;
748 int i;
749 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
750 for (i = 0 ; i < 4; i++) {
751 if (mask & (1<<i)) {
752 dst = get_dst_reg(c, inst, i, 1);
753 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
754 brw_FRC(p, dst, src0);
755 }
756 }
757 if (inst->SaturateMode != SATURATE_OFF)
758 brw_set_saturate(p, 0);
759 }
760
761 static void emit_flr(struct brw_wm_compile *c,
762 struct prog_instruction *inst)
763 {
764 struct brw_compile *p = &c->func;
765 struct brw_reg src0, dst;
766 GLuint mask = inst->DstReg.WriteMask;
767 int i;
768 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
769 for (i = 0 ; i < 4; i++) {
770 if (mask & (1<<i)) {
771 dst = get_dst_reg(c, inst, i, 1);
772 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
773 brw_RNDD(p, dst, src0);
774 }
775 }
776 brw_set_saturate(p, 0);
777 }
778
779 static void emit_max(struct brw_wm_compile *c,
780 struct prog_instruction *inst)
781 {
782 struct brw_compile *p = &c->func;
783 GLuint mask = inst->DstReg.WriteMask;
784 struct brw_reg src0, src1, dst;
785 int i;
786 brw_push_insn_state(p);
787 for (i = 0; i < 4; i++) {
788 if (mask & (1<<i)) {
789 dst = get_dst_reg(c, inst, i, 1);
790 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
791 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
792 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
793 brw_MOV(p, dst, src0);
794 brw_set_saturate(p, 0);
795
796 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src0, src1);
797 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
798 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
799 brw_MOV(p, dst, src1);
800 brw_set_saturate(p, 0);
801 brw_set_predicate_control_flag_value(p, 0xff);
802 }
803 }
804 brw_pop_insn_state(p);
805 }
806
807 static void emit_min(struct brw_wm_compile *c,
808 struct prog_instruction *inst)
809 {
810 struct brw_compile *p = &c->func;
811 GLuint mask = inst->DstReg.WriteMask;
812 struct brw_reg src0, src1, dst;
813 int i;
814 brw_push_insn_state(p);
815 for (i = 0; i < 4; i++) {
816 if (mask & (1<<i)) {
817 dst = get_dst_reg(c, inst, i, 1);
818 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
819 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
820 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
821 brw_MOV(p, dst, src0);
822 brw_set_saturate(p, 0);
823
824 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
825 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
826 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
827 brw_MOV(p, dst, src1);
828 brw_set_saturate(p, 0);
829 brw_set_predicate_control_flag_value(p, 0xff);
830 }
831 }
832 brw_pop_insn_state(p);
833 }
834
835 static void emit_pow(struct brw_wm_compile *c,
836 struct prog_instruction *inst)
837 {
838 struct brw_compile *p = &c->func;
839 struct brw_reg dst, src0, src1;
840 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
841 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
842 src1 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
843
844 brw_MOV(p, brw_message_reg(2), src0);
845 brw_MOV(p, brw_message_reg(3), src1);
846
847 brw_math(p,
848 dst,
849 BRW_MATH_FUNCTION_POW,
850 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
851 2,
852 brw_null_reg(),
853 BRW_MATH_DATA_VECTOR,
854 BRW_MATH_PRECISION_FULL);
855 }
856
857 static void emit_lrp(struct brw_wm_compile *c,
858 struct prog_instruction *inst)
859 {
860 struct brw_compile *p = &c->func;
861 GLuint mask = inst->DstReg.WriteMask;
862 struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
863 int i;
864 int mark = mark_tmps(c);
865 for (i = 0; i < 4; i++) {
866 if (mask & (1<<i)) {
867 dst = get_dst_reg(c, inst, i, 1);
868 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
869
870 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
871
872 if (src1.nr == dst.nr) {
873 tmp1 = alloc_tmp(c);
874 brw_MOV(p, tmp1, src1);
875 } else
876 tmp1 = src1;
877
878 src2 = get_src_reg(c, &inst->SrcReg[2], i, 1);
879 if (src2.nr == dst.nr) {
880 tmp2 = alloc_tmp(c);
881 brw_MOV(p, tmp2, src2);
882 } else
883 tmp2 = src2;
884
885 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
886 brw_MUL(p, brw_null_reg(), dst, tmp2);
887 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
888 brw_MAC(p, dst, src0, tmp1);
889 brw_set_saturate(p, 0);
890 }
891 release_tmps(c, mark);
892 }
893 }
894
895 /**
896 * For GLSL shaders, this KIL will be unconditional.
897 * It may be contained inside an IF/ENDIF structure of course.
898 */
899 static void emit_kil(struct brw_wm_compile *c)
900 {
901 struct brw_compile *p = &c->func;
902 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
903 brw_push_insn_state(p);
904 brw_set_mask_control(p, BRW_MASK_DISABLE);
905 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
906 brw_AND(p, depth, c->emit_mask_reg, depth);
907 brw_pop_insn_state(p);
908 }
909
910 static void emit_mad(struct brw_wm_compile *c,
911 struct prog_instruction *inst)
912 {
913 struct brw_compile *p = &c->func;
914 GLuint mask = inst->DstReg.WriteMask;
915 struct brw_reg dst, src0, src1, src2;
916 int i;
917
918 for (i = 0; i < 4; i++) {
919 if (mask & (1<<i)) {
920 dst = get_dst_reg(c, inst, i, 1);
921 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
922 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
923 src2 = get_src_reg(c, &inst->SrcReg[2], i, 1);
924 brw_MUL(p, dst, src0, src1);
925
926 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
927 brw_ADD(p, dst, dst, src2);
928 brw_set_saturate(p, 0);
929 }
930 }
931 }
932
933 static void emit_sop(struct brw_wm_compile *c,
934 struct prog_instruction *inst, GLuint cond)
935 {
936 struct brw_compile *p = &c->func;
937 GLuint mask = inst->DstReg.WriteMask;
938 struct brw_reg dst, src0, src1;
939 int i;
940
941 for (i = 0; i < 4; i++) {
942 if (mask & (1<<i)) {
943 dst = get_dst_reg(c, inst, i, 1);
944 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
945 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
946 brw_push_insn_state(p);
947 brw_CMP(p, brw_null_reg(), cond, src0, src1);
948 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
949 brw_MOV(p, dst, brw_imm_f(0.0));
950 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
951 brw_MOV(p, dst, brw_imm_f(1.0));
952 brw_pop_insn_state(p);
953 }
954 }
955 }
956
957 static void emit_slt(struct brw_wm_compile *c,
958 struct prog_instruction *inst)
959 {
960 emit_sop(c, inst, BRW_CONDITIONAL_L);
961 }
962
963 static void emit_sle(struct brw_wm_compile *c,
964 struct prog_instruction *inst)
965 {
966 emit_sop(c, inst, BRW_CONDITIONAL_LE);
967 }
968
969 static void emit_sgt(struct brw_wm_compile *c,
970 struct prog_instruction *inst)
971 {
972 emit_sop(c, inst, BRW_CONDITIONAL_G);
973 }
974
975 static void emit_sge(struct brw_wm_compile *c,
976 struct prog_instruction *inst)
977 {
978 emit_sop(c, inst, BRW_CONDITIONAL_GE);
979 }
980
981 static void emit_seq(struct brw_wm_compile *c,
982 struct prog_instruction *inst)
983 {
984 emit_sop(c, inst, BRW_CONDITIONAL_EQ);
985 }
986
987 static void emit_sne(struct brw_wm_compile *c,
988 struct prog_instruction *inst)
989 {
990 emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
991 }
992
993 static void emit_ddx(struct brw_wm_compile *c,
994 struct prog_instruction *inst)
995 {
996 struct brw_compile *p = &c->func;
997 GLuint mask = inst->DstReg.WriteMask;
998 struct brw_reg interp[4];
999 struct brw_reg dst;
1000 struct brw_reg src0, w;
1001 GLuint nr, i;
1002 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
1003 w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
1004 nr = src0.nr;
1005 interp[0] = brw_vec1_grf(nr, 0);
1006 interp[1] = brw_vec1_grf(nr, 4);
1007 interp[2] = brw_vec1_grf(nr+1, 0);
1008 interp[3] = brw_vec1_grf(nr+1, 4);
1009 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1010 for(i = 0; i < 4; i++ ) {
1011 if (mask & (1<<i)) {
1012 dst = get_dst_reg(c, inst, i, 1);
1013 brw_MOV(p, dst, interp[i]);
1014 brw_MUL(p, dst, dst, w);
1015 }
1016 }
1017 brw_set_saturate(p, 0);
1018 }
1019
1020 static void emit_ddy(struct brw_wm_compile *c,
1021 struct prog_instruction *inst)
1022 {
1023 struct brw_compile *p = &c->func;
1024 GLuint mask = inst->DstReg.WriteMask;
1025 struct brw_reg interp[4];
1026 struct brw_reg dst;
1027 struct brw_reg src0, w;
1028 GLuint nr, i;
1029
1030 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
1031 nr = src0.nr;
1032 w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
1033 interp[0] = brw_vec1_grf(nr, 0);
1034 interp[1] = brw_vec1_grf(nr, 4);
1035 interp[2] = brw_vec1_grf(nr+1, 0);
1036 interp[3] = brw_vec1_grf(nr+1, 4);
1037 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1038 for(i = 0; i < 4; i++ ) {
1039 if (mask & (1<<i)) {
1040 dst = get_dst_reg(c, inst, i, 1);
1041 brw_MOV(p, dst, suboffset(interp[i], 1));
1042 brw_MUL(p, dst, dst, w);
1043 }
1044 }
1045 brw_set_saturate(p, 0);
1046 }
1047
1048 static __inline struct brw_reg high_words( struct brw_reg reg )
1049 {
1050 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1051 0, 8, 2 );
1052 }
1053
1054 static __inline struct brw_reg low_words( struct brw_reg reg )
1055 {
1056 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1057 }
1058
1059 static __inline struct brw_reg even_bytes( struct brw_reg reg )
1060 {
1061 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1062 }
1063
1064 static __inline struct brw_reg odd_bytes( struct brw_reg reg )
1065 {
1066 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1067 0, 16, 2 );
1068 }
1069
/* One-, two- and three-dimensional Perlin noise, similar to the description
   in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */

/* Emit the instruction sequence for one-dimensional noise.

   The coordinate is read from the temporary at stack index (mark - 2), and
   the final MUL writes the result back into that same register; emit_noise1()
   reads it from there after invoking this routine.  The index arithmetic
   presumably accounts for one extra temporary allocated by
   invoke_subroutine() between the caller's argument and our own mark --
   TODO confirm against invoke_subroutine().

   Five temporaries are allocated here and released again before returning.
   The access mode is switched to BRW_ALIGN_1 and is not restored by this
   function. */
static void noise1_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param,
        x0, x1, /* gradients at each end */
        t, tmp[ 2 ], /* float temporaries */
        itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
    int i;
    int mark = mark_tmps( c );

    x0 = alloc_tmp( c );
    x1 = alloc_tmp( c );
    t = alloc_tmp( c );
    tmp[ 0 ] = alloc_tmp( c );
    tmp[ 1 ] = alloc_tmp( c );
    /* itmp[ 0..1 ] are the hash working registers; itmp[ 2..4 ] reuse the
       x0/x1/t registers to hold the three hash multipliers, which is safe
       only because x0, x1 and t are not written as floats until the
       hashing below has finished. */
    itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
    itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
    itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
    itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
    itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );

    /* The caller's argument register (also used for the return value). */
    param = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */

    /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
       be hashed.  Also compute the remainder (offset within the unit
       length), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
    brw_FRC( p, param, param );
    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
    brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */

    /* We're now ready to perform the hashing.  The two hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 32x16
       bit multiplication, and 16-bit swizzles (which we get for
       free).  We can't use immediate operands in the multiplies,
       because immediates are permitted only in src1 and the 16-bit
       factor is permitted only in src0. */
    /* Three rounds, one per multiplier: multiply, then fold the high
       word of each hash into its low word with XOR. */
    for( i = 0; i < 2; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );
    for( i = 0; i < 2; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );
    for( i = 0; i < 2; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
    for( i = 0; i < 2; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );

    /* Now we want to initialise the two gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 31 ), but
       we correct for that right at the end. */
    /* From here on t, x0 and x1 are overwritten as floats; the hash
       multipliers they held are no longer needed. */
    brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
    brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );

    /* Gradient contribution at each end: the gradient times the signed
       distance to that end (param for the near end, param - 1 for the
       far end). */
    brw_MUL( p, x0, x0, param );
    brw_MUL( p, x1, x1, t );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    /* Evaluated as ((6t - 15)t + 10) * t * t * t; the final product is
       stored over param. */
    brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
                                           pipeline */
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
    brw_MUL( p, param, tmp[ 0 ], param );
    /* x0 + (x1 - x0) * weight */
    brw_MUL( p, x1, x1, param );
    brw_ADD( p, x0, x0, x1 );
    /* scale by pow( 2, -30 ), to compensate for the format conversion
       above and an extra factor of 2 so that a single gradient covers
       the [-1,1] range */
    brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );

    release_tmps( c, mark );
}
1162
1163 static void emit_noise1( struct brw_wm_compile *c,
1164 struct prog_instruction *inst )
1165 {
1166 struct brw_compile *p = &c->func;
1167 struct brw_reg src, param, dst;
1168 GLuint mask = inst->DstReg.WriteMask;
1169 int i;
1170 int mark = mark_tmps( c );
1171
1172 assert( mark == 0 );
1173
1174 src = get_src_reg( c, inst->SrcReg, 0, 1 );
1175
1176 param = alloc_tmp( c );
1177
1178 brw_MOV( p, param, src );
1179
1180 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1181
1182 /* Fill in the result: */
1183 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1184 for (i = 0 ; i < 4; i++) {
1185 if (mask & (1<<i)) {
1186 dst = get_dst_reg(c, inst, i, 1);
1187 brw_MOV( p, dst, param );
1188 }
1189 }
1190 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1191 brw_set_saturate( p, 0 );
1192
1193 release_tmps( c, mark );
1194 }
1195
/* Emit the instruction sequence for two-dimensional noise.

   The two coordinates are read from the temporaries at stack indices
   (mark - 3) and (mark - 2); the final MUL writes the result into the first
   of them, which is where emit_noise2() reads it after invoking this
   routine.  The index arithmetic presumably accounts for one extra
   temporary allocated by invoke_subroutine() -- TODO confirm.

   Nine temporaries are allocated here and released again before returning.
   The access mode is switched to BRW_ALIGN_1 and is not restored by this
   function. */
static void noise2_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param0, param1,
        x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
        t, tmp[ 4 ], /* float temporaries */
        itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
    int i;
    int mark = mark_tmps( c );

    x0y0 = alloc_tmp( c );
    x0y1 = alloc_tmp( c );
    x1y0 = alloc_tmp( c );
    x1y1 = alloc_tmp( c );
    t = alloc_tmp( c );
    for( i = 0; i < 4; i++ ) {
        tmp[ i ] = alloc_tmp( c );
        itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
    }
    /* itmp[ 4..6 ] reuse three of the gradient registers to hold the hash
       multipliers; those gradients are not written until the hashing
       below has finished. */
    itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
    itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
    itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );

    /* The caller's argument registers; param0 also receives the result. */
    param0 = lookup_tmp( c, mark - 3 );
    param1 = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
       be hashed.  Also compute the remainders (offsets within the unit
       square), interleaved to reduce register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
    brw_FRC( p, param0, param0 );
    brw_FRC( p, param1, param1 );
    brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
    /* Fold the low word of floor(y) into the high word of floor(x), so
       itmp[ 0 ] carries both coordinates of the base corner. */
    brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
             low_words( itmp[ 1 ] ) );
    brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
    brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
    /* Derive the other three corners from the base corner: +1 in the
       high word (y), +1 in the low word (x), and +1 in both. */
    brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
    brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
    brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );

    /* We're now ready to perform the hashing.  The four hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 32x16
       bit multiplication, and 16-bit swizzles (which we get for
       free).  We can't use immediate operands in the multiplies,
       because immediates are permitted only in src1 and the 16-bit
       factor is permitted only in src0. */
    /* Three rounds, one per multiplier: multiply, then fold the high
       word of each hash into its low word with XOR. */
    for( i = 0; i < 4; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );
    for( i = 0; i < 4; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );
    for( i = 0; i < 4; i++ )
        brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
                 high_words( itmp[ i ] ) );

    /* Now we want to initialise the four gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 15 ), but
       we correct for that right at the end. */
    /* The low word of each hash supplies the x component of that corner's
       gradient.  From here on the gradient registers (and with them the
       hash multipliers they held) are overwritten. */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
    brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
    brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );

    /* The high word supplies the y component, converted in place. */
    brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );

    /* x components times the x distance to each corner (t = x - 1 for the
       x1 corners), then t is recomputed as y - 1. */
    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    /* y components times the y distance to each corner. */
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
    brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
    brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );

    /* Sum the two products to get each corner's contribution. */
    brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    /* Evaluated per axis as ((6t - 15)t + 10) * t * t * t; the weights
       end up stored over param0 and param1. */
    brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
    brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
                                                 pipeline */
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
                                                 pipeline */
    brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
    brw_MUL( p, param0, tmp[ 0 ], param0 );
    brw_MUL( p, param1, tmp[ 1 ], param1 );

    /* Here we interpolate in the y dimension... */
    /* (the differences x?y1 - x?y0 were already formed above) */
    brw_MUL( p, x0y1, x0y1, param1 );
    brw_MUL( p, x1y1, x1y1, param1 );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  There are horrible register dependencies here,
       but we have nothing else to do. */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, param0 );
    brw_ADD( p, x0y0, x0y0, x1y0 );

    /* scale by pow( 2, -15 ), as described above */
    brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );

    release_tmps( c, mark );
}
1332
1333 static void emit_noise2( struct brw_wm_compile *c,
1334 struct prog_instruction *inst )
1335 {
1336 struct brw_compile *p = &c->func;
1337 struct brw_reg src0, src1, param0, param1, dst;
1338 GLuint mask = inst->DstReg.WriteMask;
1339 int i;
1340 int mark = mark_tmps( c );
1341
1342 assert( mark == 0 );
1343
1344 src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
1345 src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
1346
1347 param0 = alloc_tmp( c );
1348 param1 = alloc_tmp( c );
1349
1350 brw_MOV( p, param0, src0 );
1351 brw_MOV( p, param1, src1 );
1352
1353 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1354
1355 /* Fill in the result: */
1356 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1357 for (i = 0 ; i < 4; i++) {
1358 if (mask & (1<<i)) {
1359 dst = get_dst_reg(c, inst, i, 1);
1360 brw_MOV( p, dst, param0 );
1361 }
1362 }
1363 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1364 brw_set_saturate( p, 0 );
1365
1366 release_tmps( c, mark );
1367 }
1368
/* The three-dimensional case is much like the one- and two- versions above,
   but since the number of corners is rapidly growing we now pack 16 16-bit
   hashes into each register to extract more parallelism from the EUs. */

/* Emit the instruction sequence for three-dimensional noise.

   The three coordinates are read from the temporaries at stack indices
   (mark - 4), (mark - 3) and (mark - 2); the final MUL writes the result
   into the first of them, which is where emit_noise3() reads it after
   invoking this routine.  The index arithmetic presumably accounts for one
   extra temporary allocated by invoke_subroutine() -- TODO confirm.

   The cube is processed as two faces of four corners each: first the face
   whose z contribution uses param2 directly (called "rear" below), then
   the face that uses param2 - 1 ("front").  The two face results are
   finally blended along z.

   Sixteen temporaries are allocated here and released again before
   returning.  The access mode is switched to BRW_ALIGN_1 and is not
   restored by this function. */
static void noise3_sub( struct brw_wm_compile *c ) {

    struct brw_compile *p = &c->func;
    struct brw_reg param0, param1, param2,
        x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
        xi, yi, zi, /* interpolation coefficients */
        t, tmp[ 8 ], /* float temporaries */
        itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
        wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
    int i;
    int mark = mark_tmps( c );

    x0y0 = alloc_tmp( c );
    x0y1 = alloc_tmp( c );
    x1y0 = alloc_tmp( c );
    x1y1 = alloc_tmp( c );
    xi = alloc_tmp( c );
    yi = alloc_tmp( c );
    zi = alloc_tmp( c );
    t = alloc_tmp( c );
    /* tmp[ i ], itmp[ i ] and wtmp[ i ] are three views of the same
       register: float, unsigned dword, and 16 unsigned words. */
    for( i = 0; i < 8; i++ ) {
        tmp[ i ] = alloc_tmp( c );
        itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
        wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
    }

    /* The caller's argument registers; param0 also receives the result. */
    param0 = lookup_tmp( c, mark - 4 );
    param1 = lookup_tmp( c, mark - 3 );
    param2 = lookup_tmp( c, mark - 2 );

    brw_set_access_mode( p, BRW_ALIGN_1 );

    /* Arrange the eight corner coordinates into 16-bit scalars, packed two
       per dword across tmp[ 0 ]..tmp[ 3 ], to be hashed.  Also compute the
       remainders (offsets within the unit cube), interleaved to reduce
       register dependency penalties. */
    brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
    brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
    brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
    brw_FRC( p, param0, param0 );
    brw_FRC( p, param1, param1 );
    brw_FRC( p, param2, param2 );
    /* Since we now have only 16 bits of precision in the hash, we must
       be more careful about thorough mixing to maintain entropy as we
       squash the input vector into a small scalar. */
    /* Weighted sum of the three integer coordinates, written to the low
       word of itmp[ 0 ].  The first two steps target the null register,
       so the running sum is presumably carried in the accumulator used by
       MAC -- confirm against the EU documentation. */
    brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
    brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
    brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
             brw_imm_uw( 0x9B93 ) );
    /* High word = low word plus the x weight, i.e. the neighbouring
       corner one step along x. */
    brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
             brw_imm_uw( 0xBC8F ) );

    /* Temporarily disable the execution mask while we work with ExecSize=16
       channels (the mask is set for ExecSize=8 and is probably incorrect).
       Although this might cause execution of unwanted channels, the code
       writes only to temporary registers and has no side effects, so
       disabling the mask is harmless. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    /* Remaining corners: add the y weight (wtmp[ 1 ]), the z weight
       (wtmp[ 2 ]), and both (wtmp[ 3 ]). */
    brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
    brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
    brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );

    /* We're now ready to perform the hashing.  The eight hashes are
       interleaved for performance.  The hash function used is
       designed to rapidly achieve avalanche and require only 16x16
       bit multiplication, and 8-bit swizzles (which we get for
       free). */
    /* Two rounds: multiply, then fold the odd byte of each word into the
       even byte with XOR. */
    for( i = 0; i < 4; i++ )
        brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
                 odd_bytes( wtmp[ i ] ) );
    for( i = 0; i < 4; i++ )
        brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
    for( i = 0; i < 4; i++ )
        brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
                 odd_bytes( wtmp[ i ] ) );
    brw_pop_insn_state( p );

    /* Now we want to initialise the four rear gradients based on the
       hashes.  Format conversion from signed integer to float leaves
       everything scaled too high by a factor of pow( 2, 15 ), but
       we correct for that right at the end. */
    /* x component */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );

    /* Shift the packed hashes left by 5 so that the next component read
       from the same words differs from the one just extracted. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    /* x components times x distance (t = x - 1 for the x1 corners); t is
       then recomputed as y - 1 for the y component. */
    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );

    /* Shift again to expose the bits used for the z component. */
    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* t goes back to x - 1; its next use is the x component of the
       front face further below. */
    brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );

    /* All four rear corners share the same z distance (param2). */
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* We interpolate between the gradients using the polynomial
       6t^5 - 15t^4 + 10t^3 (Perlin). */
    /* Evaluated per axis as ((6t - 15)t + 10) * t * t * t into xi, yi
       and zi; unlike the 1D/2D versions the param registers are left
       holding the plain fractional offsets, which the front face still
       needs. */
    brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
    brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
    brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
    brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
    brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
    brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );
    brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
    brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
    brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );
    brw_MUL( p, xi, xi, param0 );
    brw_MUL( p, yi, yi, param1 );
    brw_MUL( p, zi, zi, param2 );

    /* Here we interpolate in the y dimension... */
    /* (the differences x?y1 - x?y0 were already formed above) */
    brw_MUL( p, x0y1, x0y1, yi );
    brw_MUL( p, x1y1, x1y1, yi );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  Leave the result in tmp[ 0 ] (see below)... */
    /* tmp[ 0 ] is free for this: its hashes have all been consumed. */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, xi );
    brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );

    /* Now do the same thing for the front four gradients... */
    /* x component */
    brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
    brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
    brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
    brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    /* t still holds x - 1 from the rear-face code above. */
    brw_MUL( p, x1y0, x1y0, t );
    brw_MUL( p, x1y1, x1y1, t );
    brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
    brw_MUL( p, x0y0, x0y0, param0 );
    brw_MUL( p, x0y1, x0y1, param0 );

    /* y component */
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );

    brw_push_insn_state( p );
    brw_set_mask_control( p, BRW_MASK_DISABLE );
    brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
    brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
    brw_pop_insn_state( p );

    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
    /* t becomes z - 1: the z distance for every front-face corner. */
    brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );

    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );

    /* z component */
    brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
    brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
    brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );

    brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
    brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
    brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
    brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );

    brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
    brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
    brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
    brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );

    /* The interpolation coefficients are still around from last time, so
       again interpolate in the y dimension... */
    brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
    brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
    brw_MUL( p, x0y1, x0y1, yi );
    brw_MUL( p, x1y1, x1y1, yi );
    brw_ADD( p, x0y0, x0y0, x0y1 );
    brw_ADD( p, x1y0, x1y0, x1y1 );

    /* And now in x.  The rear face is in tmp[ 0 ] (see above), so this
       time put the front face in tmp[ 1 ] and we're nearly there... */
    brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
    brw_MUL( p, x1y0, x1y0, xi );
    brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );

    /* The final interpolation, in the z dimension: */
    brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
    brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
    brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );

    /* scale by pow( 2, -15 ), as described above */
    brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );

    release_tmps( c, mark );
}
1633
1634 static void emit_noise3( struct brw_wm_compile *c,
1635 struct prog_instruction *inst )
1636 {
1637 struct brw_compile *p = &c->func;
1638 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1639 GLuint mask = inst->DstReg.WriteMask;
1640 int i;
1641 int mark = mark_tmps( c );
1642
1643 assert( mark == 0 );
1644
1645 src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
1646 src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
1647 src2 = get_src_reg( c, inst->SrcReg, 2, 1 );
1648
1649 param0 = alloc_tmp( c );
1650 param1 = alloc_tmp( c );
1651 param2 = alloc_tmp( c );
1652
1653 brw_MOV( p, param0, src0 );
1654 brw_MOV( p, param1, src1 );
1655 brw_MOV( p, param2, src2 );
1656
1657 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1658
1659 /* Fill in the result: */
1660 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1661 for (i = 0 ; i < 4; i++) {
1662 if (mask & (1<<i)) {
1663 dst = get_dst_reg(c, inst, i, 1);
1664 brw_MOV( p, dst, param0 );
1665 }
1666 }
1667 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1668 brw_set_saturate( p, 0 );
1669
1670 release_tmps( c, mark );
1671 }
1672
1673 /* For the four-dimensional case, the little micro-optimisation benefits
1674 we obtain by unrolling all the loops aren't worth the massive bloat it
1675 now causes. Instead, we loop twice around performing a similar operation
1676 to noise3, once for the w=0 cube and once for the w=1, with a bit more
1677 code to glue it all together. */
1678 static void noise4_sub( struct brw_wm_compile *c ) {
1679
1680 struct brw_compile *p = &c->func;
1681 struct brw_reg param[ 4 ],
1682 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1683 w0, /* noise for the w=0 cube */
1684 floors[ 2 ], /* integer coordinates of base corner of hypercube */
1685 interp[ 4 ], /* interpolation coefficients */
1686 t, tmp[ 8 ], /* float temporaries */
1687 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1688 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1689 int i, j;
1690 int mark = mark_tmps( c );
1691 GLuint loop, origin;
1692
1693 x0y0 = alloc_tmp( c );
1694 x0y1 = alloc_tmp( c );
1695 x1y0 = alloc_tmp( c );
1696 x1y1 = alloc_tmp( c );
1697 t = alloc_tmp( c );
1698 w0 = alloc_tmp( c );
1699 floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1700 floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1701
1702 for( i = 0; i < 4; i++ ) {
1703 param[ i ] = lookup_tmp( c, mark - 5 + i );
1704 interp[ i ] = alloc_tmp( c );
1705 }
1706
1707 for( i = 0; i < 8; i++ ) {
1708 tmp[ i ] = alloc_tmp( c );
1709 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1710 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1711 }
1712
1713 brw_set_access_mode( p, BRW_ALIGN_1 );
1714
1715 /* We only want 16 bits of precision from the integral part of each
1716 co-ordinate, but unfortunately the RNDD semantics would saturate
1717 at 16 bits if we performed the operation directly to a 16-bit
1718 destination. Therefore, we round to 32-bit temporaries where
1719 appropriate, and then store only the lower 16 bits. */
1720 brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
1721 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
1722 brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
1723 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
1724 brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
1725 brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
1726
1727 /* Modify the flag register here, because the side effect is useful
1728 later (see below). We know for certain that all flags will be
1729 cleared, since the FRC instruction cannot possibly generate
1730 negative results. Even for exceptional inputs (infinities, denormals,
1731 NaNs), the architecture guarantees that the L conditional is false. */
1732 brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
1733 brw_FRC( p, param[ 0 ], param[ 0 ] );
1734 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1735 for( i = 1; i < 4; i++ )
1736 brw_FRC( p, param[ i ], param[ i ] );
1737
1738 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
1739 of all. */
1740 for( i = 0; i < 4; i++ )
1741 brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
1742 for( i = 0; i < 4; i++ )
1743 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
1744 for( i = 0; i < 4; i++ )
1745 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1746 for( i = 0; i < 4; i++ )
1747 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
1748 for( j = 0; j < 3; j++ )
1749 for( i = 0; i < 4; i++ )
1750 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1751
1752 /* Mark the current address, as it will be a jump destination. The
1753 following code will be executed twice: first, with the flag
1754 register clear indicating the w=0 case, and second with flags
1755 set for w=1. */
1756 loop = p->nr_insn;
1757
1758 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1759 be hashed. Since we have only 16 bits of precision in the hash, we
1760 must be careful about thorough mixing to maintain entropy as we
1761 squash the input vector into a small scalar. */
1762 brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
1763 brw_imm_uw( 0xBC8F ) );
1764 brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
1765 brw_imm_uw( 0xD0BD ) );
1766 brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
1767 brw_imm_uw( 0x9B93 ) );
1768 brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
1769 brw_imm_uw( 0xA359 ) );
1770 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1771 brw_imm_uw( 0xBC8F ) );
1772
1773 /* Temporarily disable the execution mask while we work with ExecSize=16
1774 channels (the mask is set for ExecSize=8 and is probably incorrect).
1775 Although this might cause execution of unwanted channels, the code
1776 writes only to temporary registers and has no side effects, so
1777 disabling the mask is harmless. */
1778 brw_push_insn_state( p );
1779 brw_set_mask_control( p, BRW_MASK_DISABLE );
1780 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1781 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1782 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1783
1784 /* We're now ready to perform the hashing. The eight hashes are
1785 interleaved for performance. The hash function used is
1786 designed to rapidly achieve avalanche and require only 16x16
1787 bit multiplication, and 8-bit swizzles (which we get for
1788 free). */
1789 for( i = 0; i < 4; i++ )
1790 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1791 for( i = 0; i < 4; i++ )
1792 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1793 odd_bytes( wtmp[ i ] ) );
1794 for( i = 0; i < 4; i++ )
1795 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1796 for( i = 0; i < 4; i++ )
1797 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1798 odd_bytes( wtmp[ i ] ) );
1799 brw_pop_insn_state( p );
1800
1801 /* Now we want to initialise the four rear gradients based on the
1802 hashes. Format conversion from signed integer to float leaves
1803 everything scaled too high by a factor of pow( 2, 15 ), but
1804 we correct for that right at the end. */
1805 /* x component */
1806 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
1807 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1808 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1809 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1810 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1811
1812 brw_push_insn_state( p );
1813 brw_set_mask_control( p, BRW_MASK_DISABLE );
1814 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1815 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1816 brw_pop_insn_state( p );
1817
1818 brw_MUL( p, x1y0, x1y0, t );
1819 brw_MUL( p, x1y1, x1y1, t );
1820 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
1821 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
1822 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
1823
1824 /* y component */
1825 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1826 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1827 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1828 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1829
1830 brw_push_insn_state( p );
1831 brw_set_mask_control( p, BRW_MASK_DISABLE );
1832 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1833 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1834 brw_pop_insn_state( p );
1835
1836 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1837 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1838 /* prepare t for the w component (used below): w the first time through
1839 the loop; w - 1 the second time) */
1840 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
1841 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
1842 p->current->header.predicate_inverse = 1;
1843 brw_MOV( p, t, param[ 3 ] );
1844 p->current->header.predicate_inverse = 0;
1845 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1846 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
1847 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
1848
1849 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1850 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1851 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1852 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1853
1854 /* z component */
1855 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1856 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1857 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1858 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1859
1860 brw_push_insn_state( p );
1861 brw_set_mask_control( p, BRW_MASK_DISABLE );
1862 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1863 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1864 brw_pop_insn_state( p );
1865
1866 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
1867 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
1868 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
1869 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
1870
1871 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1872 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1873 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1874 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1875
1876 /* w component */
1877 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1878 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1879 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1880 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1881
1882 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1883 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1884 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1885 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1886 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
1887
1888 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1889 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1890 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1891 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1892
1893 /* Here we interpolate in the y dimension... */
1894 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1895 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1896 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
1897 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
1898 brw_ADD( p, x0y0, x0y0, x0y1 );
1899 brw_ADD( p, x1y0, x1y0, x1y1 );
1900
1901 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1902 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1903 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
1904 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
1905
1906 /* Now do the same thing for the front four gradients... */
1907 /* x component */
1908 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
1909 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
1910 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
1911 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
1912
1913 brw_push_insn_state( p );
1914 brw_set_mask_control( p, BRW_MASK_DISABLE );
1915 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
1916 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
1917 brw_pop_insn_state( p );
1918
1919 brw_MUL( p, x1y0, x1y0, t );
1920 brw_MUL( p, x1y1, x1y1, t );
1921 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
1922 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
1923 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
1924
1925 /* y component */
1926 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1927 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1928 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1929 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1930
1931 brw_push_insn_state( p );
1932 brw_set_mask_control( p, BRW_MASK_DISABLE );
1933 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
1934 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
1935 brw_pop_insn_state( p );
1936
1937 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1938 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1939 brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
1940 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
1941 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
1942
1943 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1944 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1945 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1946 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1947
1948 /* z component */
1949 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1950 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1951 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1952 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1953
1954 brw_push_insn_state( p );
1955 brw_set_mask_control( p, BRW_MASK_DISABLE );
1956 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
1957 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
1958 brw_pop_insn_state( p );
1959
1960 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1961 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1962 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1963 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1964 /* prepare t for the w component (used below): w the first time through
1965 the loop; w - 1 the second time) */
1966 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
1967 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
1968 p->current->header.predicate_inverse = 1;
1969 brw_MOV( p, t, param[ 3 ] );
1970 p->current->header.predicate_inverse = 0;
1971 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1972
1973 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1974 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1975 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1976 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1977
1978 /* w component */
1979 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1980 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1981 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1982 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1983
1984 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1985 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1986 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1987 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1988
1989 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1990 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1991 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1992 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1993
1994 /* Interpolate in the y dimension: */
1995 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1996 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1997 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
1998 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
1999 brw_ADD( p, x0y0, x0y0, x0y1 );
2000 brw_ADD( p, x1y0, x1y0, x1y1 );
2001
2002 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2003 time put the front face in tmp[ 1 ] and we're nearly there... */
2004 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2005 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2006 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2007
2008 /* Another interpolation, in the z dimension: */
2009 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2010 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2011 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2012
2013 /* Exit the loop if we've computed both cubes... */
2014 origin = p->nr_insn;
2015 brw_push_insn_state( p );
2016 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2017 brw_set_mask_control( p, BRW_MASK_DISABLE );
2018 brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2019 brw_pop_insn_state( p );
2020
2021 /* Save the result for the w=0 case, and increment the w coordinate: */
2022 brw_MOV( p, w0, tmp[ 0 ] );
2023 brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2024 brw_imm_uw( 1 ) );
2025
2026 /* Loop around for the other cube. Explicitly set the flag register
2027 (unfortunately we must spend an extra instruction to do this: we
2028 can't rely on a side effect of the previous MOV or ADD because
2029 conditional modifiers which are normally true might be false in
2030 exceptional circumstances, e.g. given a NaN input; the add to
2031 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
2032 brw_push_insn_state( p );
2033 brw_set_mask_control( p, BRW_MASK_DISABLE );
2034 brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2035 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2036 brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2037 brw_pop_insn_state( p );
2038
2039 /* Patch the previous conditional branch now that we know the
2040 destination address. */
2041 brw_set_src1( p->store + origin,
2042 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2043
2044 /* The very last interpolation. */
2045 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2046 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2047 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2048
2049 /* scale by pow( 2, -15 ), as described above */
2050 brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2051
2052 release_tmps( c, mark );
2053 }
2054
2055 static void emit_noise4( struct brw_wm_compile *c,
2056 struct prog_instruction *inst )
2057 {
2058 struct brw_compile *p = &c->func;
2059 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2060 GLuint mask = inst->DstReg.WriteMask;
2061 int i;
2062 int mark = mark_tmps( c );
2063
2064 assert( mark == 0 );
2065
2066 src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
2067 src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
2068 src2 = get_src_reg( c, inst->SrcReg, 2, 1 );
2069 src3 = get_src_reg( c, inst->SrcReg, 3, 1 );
2070
2071 param0 = alloc_tmp( c );
2072 param1 = alloc_tmp( c );
2073 param2 = alloc_tmp( c );
2074 param3 = alloc_tmp( c );
2075
2076 brw_MOV( p, param0, src0 );
2077 brw_MOV( p, param1, src1 );
2078 brw_MOV( p, param2, src2 );
2079 brw_MOV( p, param3, src3 );
2080
2081 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2082
2083 /* Fill in the result: */
2084 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2085 for (i = 0 ; i < 4; i++) {
2086 if (mask & (1<<i)) {
2087 dst = get_dst_reg(c, inst, i, 1);
2088 brw_MOV( p, dst, param0 );
2089 }
2090 }
2091 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2092 brw_set_saturate( p, 0 );
2093
2094 release_tmps( c, mark );
2095 }
2096
2097 static void emit_wpos_xy(struct brw_wm_compile *c,
2098 struct prog_instruction *inst)
2099 {
2100 struct brw_compile *p = &c->func;
2101 GLuint mask = inst->DstReg.WriteMask;
2102 struct brw_reg src0[2], dst[2];
2103
2104 dst[0] = get_dst_reg(c, inst, 0, 1);
2105 dst[1] = get_dst_reg(c, inst, 1, 1);
2106
2107 src0[0] = get_src_reg(c, &inst->SrcReg[0], 0, 1);
2108 src0[1] = get_src_reg(c, &inst->SrcReg[0], 1, 1);
2109
2110 /* Calculate the pixel offset from window bottom left into destination
2111 * X and Y channels.
2112 */
2113 if (mask & WRITEMASK_X) {
2114 /* X' = X - origin_x */
2115 brw_ADD(p,
2116 dst[0],
2117 retype(src0[0], BRW_REGISTER_TYPE_W),
2118 brw_imm_d(0 - c->key.origin_x));
2119 }
2120
2121 if (mask & WRITEMASK_Y) {
2122 /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2123 brw_ADD(p,
2124 dst[1],
2125 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2126 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2127 }
2128 }
2129
2130 /* TODO
2131 BIAS on SIMD8 not workind yet...
2132 */
2133 static void emit_txb(struct brw_wm_compile *c,
2134 struct prog_instruction *inst)
2135 {
2136 struct brw_compile *p = &c->func;
2137 struct brw_reg dst[4], src[4], payload_reg;
2138 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2139
2140 GLuint i;
2141 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2142 for (i = 0; i < 4; i++)
2143 dst[i] = get_dst_reg(c, inst, i, 1);
2144 for (i = 0; i < 4; i++)
2145 src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
2146
2147 switch (inst->TexSrcTarget) {
2148 case TEXTURE_1D_INDEX:
2149 brw_MOV(p, brw_message_reg(2), src[0]);
2150 brw_MOV(p, brw_message_reg(3), brw_imm_f(0));
2151 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2152 break;
2153 case TEXTURE_2D_INDEX:
2154 case TEXTURE_RECT_INDEX:
2155 brw_MOV(p, brw_message_reg(2), src[0]);
2156 brw_MOV(p, brw_message_reg(3), src[1]);
2157 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2158 break;
2159 default:
2160 brw_MOV(p, brw_message_reg(2), src[0]);
2161 brw_MOV(p, brw_message_reg(3), src[1]);
2162 brw_MOV(p, brw_message_reg(4), src[2]);
2163 break;
2164 }
2165 brw_MOV(p, brw_message_reg(5), src[3]);
2166 brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
2167 brw_SAMPLE(p,
2168 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
2169 1,
2170 retype(payload_reg, BRW_REGISTER_TYPE_UW),
2171 unit + MAX_DRAW_BUFFERS, /* surface */
2172 unit, /* sampler */
2173 inst->DstReg.WriteMask,
2174 BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,
2175 4,
2176 4,
2177 0);
2178 }
2179
2180 static void emit_tex(struct brw_wm_compile *c,
2181 struct prog_instruction *inst)
2182 {
2183 struct brw_compile *p = &c->func;
2184 struct brw_reg dst[4], src[4], payload_reg;
2185 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2186
2187 GLuint msg_len;
2188 GLuint i, nr;
2189 GLuint emit;
2190 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2191
2192 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2193
2194 for (i = 0; i < 4; i++)
2195 dst[i] = get_dst_reg(c, inst, i, 1);
2196 for (i = 0; i < 4; i++)
2197 src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
2198
2199
2200 switch (inst->TexSrcTarget) {
2201 case TEXTURE_1D_INDEX:
2202 emit = WRITEMASK_X;
2203 nr = 1;
2204 break;
2205 case TEXTURE_2D_INDEX:
2206 case TEXTURE_RECT_INDEX:
2207 emit = WRITEMASK_XY;
2208 nr = 2;
2209 break;
2210 default:
2211 emit = WRITEMASK_XYZ;
2212 nr = 3;
2213 break;
2214 }
2215 msg_len = 1;
2216
2217 for (i = 0; i < nr; i++) {
2218 static const GLuint swz[4] = {0,1,2,2};
2219 if (emit & (1<<i))
2220 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2221 else
2222 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2223 msg_len += 1;
2224 }
2225
2226 if (shadow) {
2227 brw_MOV(p, brw_message_reg(5), brw_imm_f(0));
2228 brw_MOV(p, brw_message_reg(6), src[2]);
2229 }
2230
2231 brw_SAMPLE(p,
2232 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
2233 1,
2234 retype(payload_reg, BRW_REGISTER_TYPE_UW),
2235 unit + MAX_DRAW_BUFFERS, /* surface */
2236 unit, /* sampler */
2237 inst->DstReg.WriteMask,
2238 BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,
2239 4,
2240 shadow ? 6 : 4,
2241 0);
2242
2243 if (shadow)
2244 brw_MOV(p, dst[3], brw_imm_f(1.0));
2245 }
2246
2247 static void post_wm_emit( struct brw_wm_compile *c )
2248 {
2249 GLuint nr_insns = c->fp->program.Base.NumInstructions;
2250 GLuint insn, target_insn;
2251 struct prog_instruction *inst1, *inst2;
2252 struct brw_instruction *brw_inst1, *brw_inst2;
2253 int offset;
2254 for (insn = 0; insn < nr_insns; insn++) {
2255 inst1 = &c->fp->program.Base.Instructions[insn];
2256 brw_inst1 = inst1->Data;
2257 switch (inst1->Opcode) {
2258 case OPCODE_CAL:
2259 target_insn = inst1->BranchTarget;
2260 inst2 = &c->fp->program.Base.Instructions[target_insn];
2261 brw_inst2 = inst2->Data;
2262 offset = brw_inst2 - brw_inst1;
2263 brw_set_src1(brw_inst1, brw_imm_d(offset*16));
2264 break;
2265 default:
2266 break;
2267 }
2268 }
2269 }
2270
2271 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2272 {
2273 #define MAX_IFSN 32
2274 #define MAX_LOOP_DEPTH 32
2275 struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
2276 struct brw_instruction *inst0, *inst1;
2277 int i, if_insn = 0, loop_insn = 0;
2278 struct brw_compile *p = &c->func;
2279 struct brw_indirect stack_index = brw_indirect(0, 0);
2280
2281 c->reg_index = 0;
2282 prealloc_reg(c);
2283 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2284 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2285
2286 for (i = 0; i < c->nr_fp_insns; i++) {
2287 struct prog_instruction *inst = &c->prog_instructions[i];
2288 struct prog_instruction *orig_inst;
2289
2290 if ((orig_inst = inst->Data) != 0)
2291 orig_inst->Data = current_insn(p);
2292
2293 if (inst->CondUpdate)
2294 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2295 else
2296 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2297
2298 switch (inst->Opcode) {
2299 case WM_PIXELXY:
2300 emit_pixel_xy(c, inst);
2301 break;
2302 case WM_DELTAXY:
2303 emit_delta_xy(c, inst);
2304 break;
2305 case WM_PIXELW:
2306 emit_pixel_w(c, inst);
2307 break;
2308 case WM_LINTERP:
2309 emit_linterp(c, inst);
2310 break;
2311 case WM_PINTERP:
2312 emit_pinterp(c, inst);
2313 break;
2314 case WM_CINTERP:
2315 emit_cinterp(c, inst);
2316 break;
2317 case WM_WPOSXY:
2318 emit_wpos_xy(c, inst);
2319 break;
2320 case WM_FB_WRITE:
2321 emit_fb_write(c, inst);
2322 break;
2323 case OPCODE_ABS:
2324 emit_abs(c, inst);
2325 break;
2326 case OPCODE_ADD:
2327 emit_add(c, inst);
2328 break;
2329 case OPCODE_SUB:
2330 emit_sub(c, inst);
2331 break;
2332 case OPCODE_FRC:
2333 emit_frc(c, inst);
2334 break;
2335 case OPCODE_FLR:
2336 emit_flr(c, inst);
2337 break;
2338 case OPCODE_LRP:
2339 emit_lrp(c, inst);
2340 break;
2341 case OPCODE_TRUNC:
2342 emit_trunc(c, inst);
2343 break;
2344 case OPCODE_MOV:
2345 emit_mov(c, inst);
2346 break;
2347 case OPCODE_DP3:
2348 emit_dp3(c, inst);
2349 break;
2350 case OPCODE_DP4:
2351 emit_dp4(c, inst);
2352 break;
2353 case OPCODE_XPD:
2354 emit_xpd(c, inst);
2355 break;
2356 case OPCODE_DPH:
2357 emit_dph(c, inst);
2358 break;
2359 case OPCODE_RCP:
2360 emit_rcp(c, inst);
2361 break;
2362 case OPCODE_RSQ:
2363 emit_rsq(c, inst);
2364 break;
2365 case OPCODE_SIN:
2366 emit_sin(c, inst);
2367 break;
2368 case OPCODE_COS:
2369 emit_cos(c, inst);
2370 break;
2371 case OPCODE_EX2:
2372 emit_ex2(c, inst);
2373 break;
2374 case OPCODE_LG2:
2375 emit_lg2(c, inst);
2376 break;
2377 case OPCODE_MAX:
2378 emit_max(c, inst);
2379 break;
2380 case OPCODE_MIN:
2381 emit_min(c, inst);
2382 break;
2383 case OPCODE_DDX:
2384 emit_ddx(c, inst);
2385 break;
2386 case OPCODE_DDY:
2387 emit_ddy(c, inst);
2388 break;
2389 case OPCODE_SLT:
2390 emit_slt(c, inst);
2391 break;
2392 case OPCODE_SLE:
2393 emit_sle(c, inst);
2394 break;
2395 case OPCODE_SGT:
2396 emit_sgt(c, inst);
2397 break;
2398 case OPCODE_SGE:
2399 emit_sge(c, inst);
2400 break;
2401 case OPCODE_SEQ:
2402 emit_seq(c, inst);
2403 break;
2404 case OPCODE_SNE:
2405 emit_sne(c, inst);
2406 break;
2407 case OPCODE_MUL:
2408 emit_mul(c, inst);
2409 break;
2410 case OPCODE_POW:
2411 emit_pow(c, inst);
2412 break;
2413 case OPCODE_MAD:
2414 emit_mad(c, inst);
2415 break;
2416 case OPCODE_NOISE1:
2417 emit_noise1(c, inst);
2418 break;
2419 case OPCODE_NOISE2:
2420 emit_noise2(c, inst);
2421 break;
2422 case OPCODE_NOISE3:
2423 emit_noise3(c, inst);
2424 break;
2425 case OPCODE_NOISE4:
2426 emit_noise4(c, inst);
2427 break;
2428 case OPCODE_TEX:
2429 emit_tex(c, inst);
2430 break;
2431 case OPCODE_TXB:
2432 emit_txb(c, inst);
2433 break;
2434 case OPCODE_KIL_NV:
2435 emit_kil(c);
2436 break;
2437 case OPCODE_IF:
2438 assert(if_insn < MAX_IFSN);
2439 if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
2440 break;
2441 case OPCODE_ELSE:
2442 if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
2443 break;
2444 case OPCODE_ENDIF:
2445 assert(if_insn > 0);
2446 brw_ENDIF(p, if_inst[--if_insn]);
2447 break;
2448 case OPCODE_BGNSUB:
2449 case OPCODE_ENDSUB:
2450 break;
2451 case OPCODE_CAL:
2452 brw_push_insn_state(p);
2453 brw_set_mask_control(p, BRW_MASK_DISABLE);
2454 brw_set_access_mode(p, BRW_ALIGN_1);
2455 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2456 brw_set_access_mode(p, BRW_ALIGN_16);
2457 brw_ADD(p, get_addr_reg(stack_index),
2458 get_addr_reg(stack_index), brw_imm_d(4));
2459 orig_inst = inst->Data;
2460 orig_inst->Data = &p->store[p->nr_insn];
2461 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2462 brw_pop_insn_state(p);
2463 break;
2464
2465 case OPCODE_RET:
2466 brw_push_insn_state(p);
2467 brw_set_mask_control(p, BRW_MASK_DISABLE);
2468 brw_ADD(p, get_addr_reg(stack_index),
2469 get_addr_reg(stack_index), brw_imm_d(-4));
2470 brw_set_access_mode(p, BRW_ALIGN_1);
2471 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2472 brw_set_access_mode(p, BRW_ALIGN_16);
2473 brw_pop_insn_state(p);
2474
2475 break;
2476 case OPCODE_BGNLOOP:
2477 loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
2478 break;
2479 case OPCODE_BRK:
2480 brw_BREAK(p);
2481 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2482 break;
2483 case OPCODE_CONT:
2484 brw_CONT(p);
2485 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2486 break;
2487 case OPCODE_ENDLOOP:
2488 loop_insn--;
2489 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
2490 /* patch all the BREAK instructions from
2491 last BEGINLOOP */
2492 while (inst0 > loop_inst[loop_insn]) {
2493 inst0--;
2494 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2495 inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
2496 inst0->bits3.if_else.pop_count = 0;
2497 } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2498 inst0->bits3.if_else.jump_count = inst1 - inst0;
2499 inst0->bits3.if_else.pop_count = 0;
2500 }
2501 }
2502 break;
2503 default:
2504 _mesa_printf("unsupported IR in fragment shader %d\n",
2505 inst->Opcode);
2506 }
2507 if (inst->CondUpdate)
2508 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2509 else
2510 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2511 }
2512 post_wm_emit(c);
2513 for (i = 0; i < c->fp->program.Base.NumInstructions; i++)
2514 c->fp->program.Base.Instructions[i].Data = NULL;
2515 }
2516
2517 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2518 {
2519 brw_wm_pass_fp(c);
2520 brw_wm_emit_glsl(brw, c);
2521 c->prog_data.total_grf = c->reg_index;
2522 c->prog_data.total_scratch = 0;
2523 }