Merge commit 'origin/gallium-0.1'
[mesa.git] / src / mesa / drivers / dri / i965 / brw_wm_glsl.c
1 #include "main/macros.h"
2 #include "shader/prog_parameter.h"
3 #include "brw_context.h"
4 #include "brw_eu.h"
5 #include "brw_wm.h"
6
/** Identifiers of the reusable instruction sequences (see invoke_subroutine). */
enum _subroutine {
    SUB_NOISE1,
    SUB_NOISE2,
    SUB_NOISE3,
    SUB_NOISE4
};
10
11
12 /**
13 * Determine if the given fragment program uses GLSL features such
14 * as flow conditionals, loops, subroutines.
15 * Some GLSL shaders may use these features, others might not.
16 */
17 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
18 {
19 int i;
20 for (i = 0; i < fp->Base.NumInstructions; i++) {
21 const struct prog_instruction *inst = &fp->Base.Instructions[i];
22 switch (inst->Opcode) {
23 case OPCODE_IF:
24 case OPCODE_TRUNC:
25 case OPCODE_ENDIF:
26 case OPCODE_CAL:
27 case OPCODE_BRK:
28 case OPCODE_RET:
29 case OPCODE_DDX:
30 case OPCODE_DDY:
31 case OPCODE_NOISE1:
32 case OPCODE_NOISE2:
33 case OPCODE_NOISE3:
34 case OPCODE_NOISE4:
35 case OPCODE_BGNLOOP:
36 return GL_TRUE;
37 default:
38 break;
39 }
40 }
41 return GL_FALSE;
42 }
43
44 static void set_reg(struct brw_wm_compile *c, int file, int index,
45 int component, struct brw_reg reg)
46 {
47 c->wm_regs[file][index][component].reg = reg;
48 c->wm_regs[file][index][component].inited = GL_TRUE;
49 }
50
51 static int get_scalar_dst_index(struct prog_instruction *inst)
52 {
53 int i;
54 for (i = 0; i < 4; i++)
55 if (inst->DstReg.WriteMask & (1<<i))
56 break;
57 return i;
58 }
59
60 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
61 {
62 struct brw_reg reg;
63 if(c->tmp_index == c->tmp_max)
64 c->tmp_regs[ c->tmp_max++ ] = c->reg_index++;
65
66 reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
67 return reg;
68 }
69
70 static int mark_tmps(struct brw_wm_compile *c)
71 {
72 return c->tmp_index;
73 }
74
75 static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
76 {
77 return brw_vec8_grf( c->tmp_regs[ index ], 0 );
78 }
79
80 static void release_tmps(struct brw_wm_compile *c, int mark)
81 {
82 c->tmp_index = mark;
83 }
84
85 static struct brw_reg
86 get_reg(struct brw_wm_compile *c, int file, int index, int component, int nr, GLuint neg, GLuint abs)
87 {
88 struct brw_reg reg;
89 switch (file) {
90 case PROGRAM_STATE_VAR:
91 case PROGRAM_CONSTANT:
92 case PROGRAM_UNIFORM:
93 file = PROGRAM_STATE_VAR;
94 break;
95 case PROGRAM_UNDEFINED:
96 return brw_null_reg();
97 default:
98 break;
99 }
100
101 if(c->wm_regs[file][index][component].inited)
102 reg = c->wm_regs[file][index][component].reg;
103 else
104 reg = brw_vec8_grf(c->reg_index, 0);
105
106 if(!c->wm_regs[file][index][component].inited) {
107 set_reg(c, file, index, component, reg);
108 c->reg_index++;
109 }
110
111 if (neg & (1<< component)) {
112 reg = negate(reg);
113 }
114 if (abs)
115 reg = brw_abs(reg);
116 return reg;
117 }
118
119 static void prealloc_reg(struct brw_wm_compile *c)
120 {
121 int i, j;
122 struct brw_reg reg;
123 int nr_interp_regs = 0;
124 GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
125
126 for (i = 0; i < 4; i++) {
127 reg = (i < c->key.nr_depth_regs)
128 ? brw_vec8_grf(i*2, 0) : brw_vec8_grf(0, 0);
129 set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
130 }
131 c->reg_index += 2*c->key.nr_depth_regs;
132 {
133 int nr_params = c->fp->program.Base.Parameters->NumParameters;
134 struct gl_program_parameter_list *plist =
135 c->fp->program.Base.Parameters;
136 int index = 0;
137 c->prog_data.nr_params = 4*nr_params;
138 for (i = 0; i < nr_params; i++) {
139 for (j = 0; j < 4; j++, index++) {
140 reg = brw_vec1_grf(c->reg_index + index/8,
141 index%8);
142 c->prog_data.param[index] =
143 &plist->ParameterValues[i][j];
144 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
145 }
146 }
147 c->nr_creg = 2*((4*nr_params+15)/16);
148 c->reg_index += c->nr_creg;
149 }
150 for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
151 if (inputs & (1<<i)) {
152 nr_interp_regs++;
153 reg = brw_vec8_grf(c->reg_index, 0);
154 for (j = 0; j < 4; j++)
155 set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
156 c->reg_index += 2;
157
158 }
159 }
160 c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
161 c->prog_data.urb_read_length = nr_interp_regs * 2;
162 c->prog_data.curb_read_length = c->nr_creg;
163 c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
164 c->reg_index++;
165 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
166 c->reg_index += 2;
167 }
168
169 static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
170 struct prog_instruction *inst, int component, int nr)
171 {
172 return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
173 0, 0);
174 }
175
176 static struct brw_reg get_src_reg(struct brw_wm_compile *c,
177 struct prog_src_register *src, int index, int nr)
178 {
179 int component = GET_SWZ(src->Swizzle, index);
180 return get_reg(c, src->File, src->Index, component, nr,
181 src->NegateBase, src->Abs);
182 }
183
184 /**
185 * Subroutines are minimal support for resusable instruction sequences.
186 * They are implemented as simply as possible to minimise overhead: there
187 * is no explicit support for communication between the caller and callee
188 * other than saving the return address in a temporary register, nor is
189 * there any automatic local storage. This implies that great care is
190 * required before attempting reentrancy or any kind of nested
191 * subroutine invocations.
192 */
193 static void invoke_subroutine( struct brw_wm_compile *c,
194 enum _subroutine subroutine,
195 void (*emit)( struct brw_wm_compile * ) )
196 {
197 struct brw_compile *p = &c->func;
198
199 assert( subroutine < BRW_WM_MAX_SUBROUTINE );
200
201 if( c->subroutines[ subroutine ] ) {
202 /* subroutine previously emitted: reuse existing instructions */
203
204 int mark = mark_tmps( c );
205 struct brw_reg return_address = retype( alloc_tmp( c ),
206 BRW_REGISTER_TYPE_UD );
207 int here = p->nr_insn;
208
209 brw_push_insn_state(p);
210 brw_set_mask_control(p, BRW_MASK_DISABLE);
211 brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
212
213 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
214 brw_imm_d( ( c->subroutines[ subroutine ] -
215 here - 1 ) << 4 ) );
216 brw_pop_insn_state(p);
217
218 release_tmps( c, mark );
219 } else {
220 /* previously unused subroutine: emit, and mark for later reuse */
221
222 int mark = mark_tmps( c );
223 struct brw_reg return_address = retype( alloc_tmp( c ),
224 BRW_REGISTER_TYPE_UD );
225 struct brw_instruction *calc;
226 int base = p->nr_insn;
227
228 brw_push_insn_state(p);
229 brw_set_mask_control(p, BRW_MASK_DISABLE);
230 calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
231 brw_pop_insn_state(p);
232
233 c->subroutines[ subroutine ] = p->nr_insn;
234
235 emit( c );
236
237 brw_push_insn_state(p);
238 brw_set_mask_control(p, BRW_MASK_DISABLE);
239 brw_MOV( p, brw_ip_reg(), return_address );
240 brw_pop_insn_state(p);
241
242 brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
243
244 release_tmps( c, mark );
245 }
246 }
247
248 static void emit_abs( struct brw_wm_compile *c,
249 struct prog_instruction *inst)
250 {
251 int i;
252 struct brw_compile *p = &c->func;
253 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
254 for (i = 0; i < 4; i++) {
255 if (inst->DstReg.WriteMask & (1<<i)) {
256 struct brw_reg src, dst;
257 dst = get_dst_reg(c, inst, i, 1);
258 src = get_src_reg(c, &inst->SrcReg[0], i, 1);
259 brw_MOV(p, dst, brw_abs(src));
260 }
261 }
262 brw_set_saturate(p, 0);
263 }
264
265 static void emit_trunc( struct brw_wm_compile *c,
266 struct prog_instruction *inst)
267 {
268 int i;
269 struct brw_compile *p = &c->func;
270 GLuint mask = inst->DstReg.WriteMask;
271 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
272 for (i = 0; i < 4; i++) {
273 if (mask & (1<<i)) {
274 struct brw_reg src, dst;
275 dst = get_dst_reg(c, inst, i, 1) ;
276 src = get_src_reg(c, &inst->SrcReg[0], i, 1);
277 brw_RNDZ(p, dst, src);
278 }
279 }
280 brw_set_saturate(p, 0);
281 }
282
283 static void emit_mov( struct brw_wm_compile *c,
284 struct prog_instruction *inst)
285 {
286 int i;
287 struct brw_compile *p = &c->func;
288 GLuint mask = inst->DstReg.WriteMask;
289 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
290 for (i = 0; i < 4; i++) {
291 if (mask & (1<<i)) {
292 struct brw_reg src, dst;
293 dst = get_dst_reg(c, inst, i, 1);
294 src = get_src_reg(c, &inst->SrcReg[0], i, 1);
295 brw_MOV(p, dst, src);
296 }
297 }
298 brw_set_saturate(p, 0);
299 }
300
301 static void emit_pixel_xy(struct brw_wm_compile *c,
302 struct prog_instruction *inst)
303 {
304 struct brw_reg r1 = brw_vec1_grf(1, 0);
305 struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
306
307 struct brw_reg dst0, dst1;
308 struct brw_compile *p = &c->func;
309 GLuint mask = inst->DstReg.WriteMask;
310
311 dst0 = get_dst_reg(c, inst, 0, 1);
312 dst1 = get_dst_reg(c, inst, 1, 1);
313 /* Calculate pixel centers by adding 1 or 0 to each of the
314 * micro-tile coordinates passed in r1.
315 */
316 if (mask & WRITEMASK_X) {
317 brw_ADD(p,
318 vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
319 stride(suboffset(r1_uw, 4), 2, 4, 0),
320 brw_imm_v(0x10101010));
321 }
322
323 if (mask & WRITEMASK_Y) {
324 brw_ADD(p,
325 vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
326 stride(suboffset(r1_uw, 5), 2, 4, 0),
327 brw_imm_v(0x11001100));
328 }
329 }
330
331 static void emit_delta_xy(struct brw_wm_compile *c,
332 struct prog_instruction *inst)
333 {
334 struct brw_reg r1 = brw_vec1_grf(1, 0);
335 struct brw_reg dst0, dst1, src0, src1;
336 struct brw_compile *p = &c->func;
337 GLuint mask = inst->DstReg.WriteMask;
338
339 dst0 = get_dst_reg(c, inst, 0, 1);
340 dst1 = get_dst_reg(c, inst, 1, 1);
341 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
342 src1 = get_src_reg(c, &inst->SrcReg[0], 1, 1);
343 /* Calc delta X,Y by subtracting origin in r1 from the pixel
344 * centers.
345 */
346 if (mask & WRITEMASK_X) {
347 brw_ADD(p,
348 dst0,
349 retype(src0, BRW_REGISTER_TYPE_UW),
350 negate(r1));
351 }
352
353 if (mask & WRITEMASK_Y) {
354 brw_ADD(p,
355 dst1,
356 retype(src1, BRW_REGISTER_TYPE_UW),
357 negate(suboffset(r1,1)));
358
359 }
360 }
361
362 static void fire_fb_write( struct brw_wm_compile *c,
363 GLuint base_reg,
364 GLuint nr,
365 GLuint target,
366 GLuint eot)
367 {
368 struct brw_compile *p = &c->func;
369 /* Pass through control information:
370 */
371 /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
372 {
373 brw_push_insn_state(p);
374 brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
375 brw_MOV(p,
376 brw_message_reg(base_reg + 1),
377 brw_vec8_grf(1, 0));
378 brw_pop_insn_state(p);
379 }
380 /* Send framebuffer write message: */
381 brw_fb_WRITE(p,
382 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
383 base_reg,
384 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
385 target,
386 nr,
387 0,
388 eot);
389 }
390
391 static void emit_fb_write(struct brw_wm_compile *c,
392 struct prog_instruction *inst)
393 {
394 struct brw_compile *p = &c->func;
395 int nr = 2;
396 int channel;
397 GLuint target, eot;
398 struct brw_reg src0;
399
400 /* Reserve a space for AA - may not be needed:
401 */
402 if (c->key.aa_dest_stencil_reg)
403 nr += 1;
404
405 brw_push_insn_state(p);
406 for (channel = 0; channel < 4; channel++) {
407 src0 = get_src_reg(c, &inst->SrcReg[0], channel, 1);
408 /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
409 /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
410 brw_MOV(p, brw_message_reg(nr + channel), src0);
411 }
412 /* skip over the regs populated above: */
413 nr += 8;
414 brw_pop_insn_state(p);
415
416 if (c->key.source_depth_to_render_target) {
417 if (c->key.computes_depth) {
418 src0 = get_src_reg(c, &inst->SrcReg[2], 2, 1);
419 brw_MOV(p, brw_message_reg(nr), src0);
420 }
421 else {
422 src0 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
423 brw_MOV(p, brw_message_reg(nr), src0);
424 }
425
426 nr += 2;
427 }
428
429 target = inst->Sampler >> 1;
430 eot = inst->Sampler & 1;
431 fire_fb_write(c, 0, nr, target, eot);
432 }
433
434 static void emit_pixel_w( struct brw_wm_compile *c,
435 struct prog_instruction *inst)
436 {
437 struct brw_compile *p = &c->func;
438 GLuint mask = inst->DstReg.WriteMask;
439 if (mask & WRITEMASK_W) {
440 struct brw_reg dst, src0, delta0, delta1;
441 struct brw_reg interp3;
442
443 dst = get_dst_reg(c, inst, 3, 1);
444 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
445 delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
446 delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
447
448 interp3 = brw_vec1_grf(src0.nr+1, 4);
449 /* Calc 1/w - just linterp wpos[3] optimized by putting the
450 * result straight into a message reg.
451 */
452 brw_LINE(p, brw_null_reg(), interp3, delta0);
453 brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
454
455 /* Calc w */
456 brw_math_16( p, dst,
457 BRW_MATH_FUNCTION_INV,
458 BRW_MATH_SATURATE_NONE,
459 2, brw_null_reg(),
460 BRW_MATH_PRECISION_FULL);
461 }
462 }
463
464 static void emit_linterp(struct brw_wm_compile *c,
465 struct prog_instruction *inst)
466 {
467 struct brw_compile *p = &c->func;
468 GLuint mask = inst->DstReg.WriteMask;
469 struct brw_reg interp[4];
470 struct brw_reg dst, delta0, delta1;
471 struct brw_reg src0;
472
473 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
474 delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
475 delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
476 GLuint nr = src0.nr;
477 int i;
478
479 interp[0] = brw_vec1_grf(nr, 0);
480 interp[1] = brw_vec1_grf(nr, 4);
481 interp[2] = brw_vec1_grf(nr+1, 0);
482 interp[3] = brw_vec1_grf(nr+1, 4);
483
484 for(i = 0; i < 4; i++ ) {
485 if (mask & (1<<i)) {
486 dst = get_dst_reg(c, inst, i, 1);
487 brw_LINE(p, brw_null_reg(), interp[i], delta0);
488 brw_MAC(p, dst, suboffset(interp[i],1), delta1);
489 }
490 }
491 }
492
493 static void emit_cinterp(struct brw_wm_compile *c,
494 struct prog_instruction *inst)
495 {
496 struct brw_compile *p = &c->func;
497 GLuint mask = inst->DstReg.WriteMask;
498
499 struct brw_reg interp[4];
500 struct brw_reg dst, src0;
501
502 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
503 GLuint nr = src0.nr;
504 int i;
505
506 interp[0] = brw_vec1_grf(nr, 0);
507 interp[1] = brw_vec1_grf(nr, 4);
508 interp[2] = brw_vec1_grf(nr+1, 0);
509 interp[3] = brw_vec1_grf(nr+1, 4);
510
511 for(i = 0; i < 4; i++ ) {
512 if (mask & (1<<i)) {
513 dst = get_dst_reg(c, inst, i, 1);
514 brw_MOV(p, dst, suboffset(interp[i],3));
515 }
516 }
517 }
518
519 static void emit_pinterp(struct brw_wm_compile *c,
520 struct prog_instruction *inst)
521 {
522 struct brw_compile *p = &c->func;
523 GLuint mask = inst->DstReg.WriteMask;
524
525 struct brw_reg interp[4];
526 struct brw_reg dst, delta0, delta1;
527 struct brw_reg src0, w;
528
529 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
530 delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
531 delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
532 w = get_src_reg(c, &inst->SrcReg[2], 3, 1);
533 GLuint nr = src0.nr;
534 int i;
535
536 interp[0] = brw_vec1_grf(nr, 0);
537 interp[1] = brw_vec1_grf(nr, 4);
538 interp[2] = brw_vec1_grf(nr+1, 0);
539 interp[3] = brw_vec1_grf(nr+1, 4);
540
541 for(i = 0; i < 4; i++ ) {
542 if (mask & (1<<i)) {
543 dst = get_dst_reg(c, inst, i, 1);
544 brw_LINE(p, brw_null_reg(), interp[i], delta0);
545 brw_MAC(p, dst, suboffset(interp[i],1),
546 delta1);
547 brw_MUL(p, dst, dst, w);
548 }
549 }
550 }
551
552 static void emit_xpd(struct brw_wm_compile *c,
553 struct prog_instruction *inst)
554 {
555 int i;
556 struct brw_compile *p = &c->func;
557 GLuint mask = inst->DstReg.WriteMask;
558 for (i = 0; i < 4; i++) {
559 GLuint i2 = (i+2)%3;
560 GLuint i1 = (i+1)%3;
561 if (mask & (1<<i)) {
562 struct brw_reg src0, src1, dst;
563 dst = get_dst_reg(c, inst, i, 1);
564 src0 = negate(get_src_reg(c, &inst->SrcReg[0], i2, 1));
565 src1 = get_src_reg(c, &inst->SrcReg[1], i1, 1);
566 brw_MUL(p, brw_null_reg(), src0, src1);
567 src0 = get_src_reg(c, &inst->SrcReg[0], i1, 1);
568 src1 = get_src_reg(c, &inst->SrcReg[1], i2, 1);
569 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
570 brw_MAC(p, dst, src0, src1);
571 brw_set_saturate(p, 0);
572 }
573 }
574 brw_set_saturate(p, 0);
575 }
576
577 static void emit_dp3(struct brw_wm_compile *c,
578 struct prog_instruction *inst)
579 {
580 struct brw_reg src0[3], src1[3], dst;
581 int i;
582 struct brw_compile *p = &c->func;
583 for (i = 0; i < 3; i++) {
584 src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
585 src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
586 }
587
588 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
589 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
590 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
591 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
592 brw_MAC(p, dst, src0[2], src1[2]);
593 brw_set_saturate(p, 0);
594 }
595
596 static void emit_dp4(struct brw_wm_compile *c,
597 struct prog_instruction *inst)
598 {
599 struct brw_reg src0[4], src1[4], dst;
600 int i;
601 struct brw_compile *p = &c->func;
602 for (i = 0; i < 4; i++) {
603 src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
604 src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
605 }
606 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
607 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
608 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
609 brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
610 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
611 brw_MAC(p, dst, src0[3], src1[3]);
612 brw_set_saturate(p, 0);
613 }
614
615 static void emit_dph(struct brw_wm_compile *c,
616 struct prog_instruction *inst)
617 {
618 struct brw_reg src0[4], src1[4], dst;
619 int i;
620 struct brw_compile *p = &c->func;
621 for (i = 0; i < 4; i++) {
622 src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
623 src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
624 }
625 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
626 brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
627 brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
628 brw_MAC(p, dst, src0[2], src1[2]);
629 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
630 brw_ADD(p, dst, dst, src1[3]);
631 brw_set_saturate(p, 0);
632 }
633
634 static void emit_math1(struct brw_wm_compile *c,
635 struct prog_instruction *inst, GLuint func)
636 {
637 struct brw_compile *p = &c->func;
638 struct brw_reg src0, dst;
639
640 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
641 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
642 brw_MOV(p, brw_message_reg(2), src0);
643 brw_math(p,
644 dst,
645 func,
646 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
647 2,
648 brw_null_reg(),
649 BRW_MATH_DATA_VECTOR,
650 BRW_MATH_PRECISION_FULL);
651 }
652
653 static void emit_rcp(struct brw_wm_compile *c,
654 struct prog_instruction *inst)
655 {
656 emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
657 }
658
659 static void emit_rsq(struct brw_wm_compile *c,
660 struct prog_instruction *inst)
661 {
662 emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
663 }
664
665 static void emit_sin(struct brw_wm_compile *c,
666 struct prog_instruction *inst)
667 {
668 emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
669 }
670
671 static void emit_cos(struct brw_wm_compile *c,
672 struct prog_instruction *inst)
673 {
674 emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
675 }
676
677 static void emit_ex2(struct brw_wm_compile *c,
678 struct prog_instruction *inst)
679 {
680 emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
681 }
682
683 static void emit_lg2(struct brw_wm_compile *c,
684 struct prog_instruction *inst)
685 {
686 emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
687 }
688
689 static void emit_add(struct brw_wm_compile *c,
690 struct prog_instruction *inst)
691 {
692 struct brw_compile *p = &c->func;
693 struct brw_reg src0, src1, dst;
694 GLuint mask = inst->DstReg.WriteMask;
695 int i;
696 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
697 for (i = 0 ; i < 4; i++) {
698 if (mask & (1<<i)) {
699 dst = get_dst_reg(c, inst, i, 1);
700 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
701 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
702 brw_ADD(p, dst, src0, src1);
703 }
704 }
705 brw_set_saturate(p, 0);
706 }
707
708 static void emit_sub(struct brw_wm_compile *c,
709 struct prog_instruction *inst)
710 {
711 struct brw_compile *p = &c->func;
712 struct brw_reg src0, src1, dst;
713 GLuint mask = inst->DstReg.WriteMask;
714 int i;
715 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
716 for (i = 0 ; i < 4; i++) {
717 if (mask & (1<<i)) {
718 dst = get_dst_reg(c, inst, i, 1);
719 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
720 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
721 brw_ADD(p, dst, src0, negate(src1));
722 }
723 }
724 brw_set_saturate(p, 0);
725 }
726
727 static void emit_mul(struct brw_wm_compile *c,
728 struct prog_instruction *inst)
729 {
730 struct brw_compile *p = &c->func;
731 struct brw_reg src0, src1, dst;
732 GLuint mask = inst->DstReg.WriteMask;
733 int i;
734 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
735 for (i = 0 ; i < 4; i++) {
736 if (mask & (1<<i)) {
737 dst = get_dst_reg(c, inst, i, 1);
738 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
739 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
740 brw_MUL(p, dst, src0, src1);
741 }
742 }
743 brw_set_saturate(p, 0);
744 }
745
746 static void emit_frc(struct brw_wm_compile *c,
747 struct prog_instruction *inst)
748 {
749 struct brw_compile *p = &c->func;
750 struct brw_reg src0, dst;
751 GLuint mask = inst->DstReg.WriteMask;
752 int i;
753 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
754 for (i = 0 ; i < 4; i++) {
755 if (mask & (1<<i)) {
756 dst = get_dst_reg(c, inst, i, 1);
757 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
758 brw_FRC(p, dst, src0);
759 }
760 }
761 if (inst->SaturateMode != SATURATE_OFF)
762 brw_set_saturate(p, 0);
763 }
764
765 static void emit_flr(struct brw_wm_compile *c,
766 struct prog_instruction *inst)
767 {
768 struct brw_compile *p = &c->func;
769 struct brw_reg src0, dst;
770 GLuint mask = inst->DstReg.WriteMask;
771 int i;
772 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
773 for (i = 0 ; i < 4; i++) {
774 if (mask & (1<<i)) {
775 dst = get_dst_reg(c, inst, i, 1);
776 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
777 brw_RNDD(p, dst, src0);
778 }
779 }
780 brw_set_saturate(p, 0);
781 }
782
783 static void emit_max(struct brw_wm_compile *c,
784 struct prog_instruction *inst)
785 {
786 struct brw_compile *p = &c->func;
787 GLuint mask = inst->DstReg.WriteMask;
788 struct brw_reg src0, src1, dst;
789 int i;
790 brw_push_insn_state(p);
791 for (i = 0; i < 4; i++) {
792 if (mask & (1<<i)) {
793 dst = get_dst_reg(c, inst, i, 1);
794 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
795 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
796 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
797 brw_MOV(p, dst, src0);
798 brw_set_saturate(p, 0);
799
800 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src0, src1);
801 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
802 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
803 brw_MOV(p, dst, src1);
804 brw_set_saturate(p, 0);
805 brw_set_predicate_control_flag_value(p, 0xff);
806 }
807 }
808 brw_pop_insn_state(p);
809 }
810
811 static void emit_min(struct brw_wm_compile *c,
812 struct prog_instruction *inst)
813 {
814 struct brw_compile *p = &c->func;
815 GLuint mask = inst->DstReg.WriteMask;
816 struct brw_reg src0, src1, dst;
817 int i;
818 brw_push_insn_state(p);
819 for (i = 0; i < 4; i++) {
820 if (mask & (1<<i)) {
821 dst = get_dst_reg(c, inst, i, 1);
822 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
823 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
824 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
825 brw_MOV(p, dst, src0);
826 brw_set_saturate(p, 0);
827
828 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
829 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
830 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
831 brw_MOV(p, dst, src1);
832 brw_set_saturate(p, 0);
833 brw_set_predicate_control_flag_value(p, 0xff);
834 }
835 }
836 brw_pop_insn_state(p);
837 }
838
839 static void emit_pow(struct brw_wm_compile *c,
840 struct prog_instruction *inst)
841 {
842 struct brw_compile *p = &c->func;
843 struct brw_reg dst, src0, src1;
844 dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
845 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
846 src1 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
847
848 brw_MOV(p, brw_message_reg(2), src0);
849 brw_MOV(p, brw_message_reg(3), src1);
850
851 brw_math(p,
852 dst,
853 BRW_MATH_FUNCTION_POW,
854 (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
855 2,
856 brw_null_reg(),
857 BRW_MATH_DATA_VECTOR,
858 BRW_MATH_PRECISION_FULL);
859 }
860
861 static void emit_lrp(struct brw_wm_compile *c,
862 struct prog_instruction *inst)
863 {
864 struct brw_compile *p = &c->func;
865 GLuint mask = inst->DstReg.WriteMask;
866 struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
867 int i;
868 int mark = mark_tmps(c);
869 for (i = 0; i < 4; i++) {
870 if (mask & (1<<i)) {
871 dst = get_dst_reg(c, inst, i, 1);
872 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
873
874 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
875
876 if (src1.nr == dst.nr) {
877 tmp1 = alloc_tmp(c);
878 brw_MOV(p, tmp1, src1);
879 } else
880 tmp1 = src1;
881
882 src2 = get_src_reg(c, &inst->SrcReg[2], i, 1);
883 if (src2.nr == dst.nr) {
884 tmp2 = alloc_tmp(c);
885 brw_MOV(p, tmp2, src2);
886 } else
887 tmp2 = src2;
888
889 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
890 brw_MUL(p, brw_null_reg(), dst, tmp2);
891 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
892 brw_MAC(p, dst, src0, tmp1);
893 brw_set_saturate(p, 0);
894 }
895 release_tmps(c, mark);
896 }
897 }
898
899 /**
900 * For GLSL shaders, this KIL will be unconditional.
901 * It may be contained inside an IF/ENDIF structure of course.
902 */
903 static void emit_kil(struct brw_wm_compile *c)
904 {
905 struct brw_compile *p = &c->func;
906 struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
907 brw_push_insn_state(p);
908 brw_set_mask_control(p, BRW_MASK_DISABLE);
909 brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
910 brw_AND(p, depth, c->emit_mask_reg, depth);
911 brw_pop_insn_state(p);
912 }
913
914 static void emit_mad(struct brw_wm_compile *c,
915 struct prog_instruction *inst)
916 {
917 struct brw_compile *p = &c->func;
918 GLuint mask = inst->DstReg.WriteMask;
919 struct brw_reg dst, src0, src1, src2;
920 int i;
921
922 for (i = 0; i < 4; i++) {
923 if (mask & (1<<i)) {
924 dst = get_dst_reg(c, inst, i, 1);
925 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
926 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
927 src2 = get_src_reg(c, &inst->SrcReg[2], i, 1);
928 brw_MUL(p, dst, src0, src1);
929
930 brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
931 brw_ADD(p, dst, dst, src2);
932 brw_set_saturate(p, 0);
933 }
934 }
935 }
936
937 static void emit_sop(struct brw_wm_compile *c,
938 struct prog_instruction *inst, GLuint cond)
939 {
940 struct brw_compile *p = &c->func;
941 GLuint mask = inst->DstReg.WriteMask;
942 struct brw_reg dst, src0, src1;
943 int i;
944
945 for (i = 0; i < 4; i++) {
946 if (mask & (1<<i)) {
947 dst = get_dst_reg(c, inst, i, 1);
948 src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
949 src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
950 brw_push_insn_state(p);
951 brw_CMP(p, brw_null_reg(), cond, src0, src1);
952 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
953 brw_MOV(p, dst, brw_imm_f(0.0));
954 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
955 brw_MOV(p, dst, brw_imm_f(1.0));
956 brw_pop_insn_state(p);
957 }
958 }
959 }
960
961 static void emit_slt(struct brw_wm_compile *c,
962 struct prog_instruction *inst)
963 {
964 emit_sop(c, inst, BRW_CONDITIONAL_L);
965 }
966
967 static void emit_sle(struct brw_wm_compile *c,
968 struct prog_instruction *inst)
969 {
970 emit_sop(c, inst, BRW_CONDITIONAL_LE);
971 }
972
973 static void emit_sgt(struct brw_wm_compile *c,
974 struct prog_instruction *inst)
975 {
976 emit_sop(c, inst, BRW_CONDITIONAL_G);
977 }
978
979 static void emit_sge(struct brw_wm_compile *c,
980 struct prog_instruction *inst)
981 {
982 emit_sop(c, inst, BRW_CONDITIONAL_GE);
983 }
984
985 static void emit_seq(struct brw_wm_compile *c,
986 struct prog_instruction *inst)
987 {
988 emit_sop(c, inst, BRW_CONDITIONAL_EQ);
989 }
990
991 static void emit_sne(struct brw_wm_compile *c,
992 struct prog_instruction *inst)
993 {
994 emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
995 }
996
997 static void emit_ddx(struct brw_wm_compile *c,
998 struct prog_instruction *inst)
999 {
1000 struct brw_compile *p = &c->func;
1001 GLuint mask = inst->DstReg.WriteMask;
1002 struct brw_reg interp[4];
1003 struct brw_reg dst;
1004 struct brw_reg src0, w;
1005 GLuint nr, i;
1006 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
1007 w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
1008 nr = src0.nr;
1009 interp[0] = brw_vec1_grf(nr, 0);
1010 interp[1] = brw_vec1_grf(nr, 4);
1011 interp[2] = brw_vec1_grf(nr+1, 0);
1012 interp[3] = brw_vec1_grf(nr+1, 4);
1013 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1014 for(i = 0; i < 4; i++ ) {
1015 if (mask & (1<<i)) {
1016 dst = get_dst_reg(c, inst, i, 1);
1017 brw_MOV(p, dst, interp[i]);
1018 brw_MUL(p, dst, dst, w);
1019 }
1020 }
1021 brw_set_saturate(p, 0);
1022 }
1023
1024 static void emit_ddy(struct brw_wm_compile *c,
1025 struct prog_instruction *inst)
1026 {
1027 struct brw_compile *p = &c->func;
1028 GLuint mask = inst->DstReg.WriteMask;
1029 struct brw_reg interp[4];
1030 struct brw_reg dst;
1031 struct brw_reg src0, w;
1032 GLuint nr, i;
1033
1034 src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
1035 nr = src0.nr;
1036 w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
1037 interp[0] = brw_vec1_grf(nr, 0);
1038 interp[1] = brw_vec1_grf(nr, 4);
1039 interp[2] = brw_vec1_grf(nr+1, 0);
1040 interp[3] = brw_vec1_grf(nr+1, 4);
1041 brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
1042 for(i = 0; i < 4; i++ ) {
1043 if (mask & (1<<i)) {
1044 dst = get_dst_reg(c, inst, i, 1);
1045 brw_MOV(p, dst, suboffset(interp[i], 1));
1046 brw_MUL(p, dst, dst, w);
1047 }
1048 }
1049 brw_set_saturate(p, 0);
1050 }
1051
1052 static INLINE struct brw_reg high_words( struct brw_reg reg )
1053 {
1054 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
1055 0, 8, 2 );
1056 }
1057
1058 static INLINE struct brw_reg low_words( struct brw_reg reg )
1059 {
1060 return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
1061 }
1062
1063 static INLINE struct brw_reg even_bytes( struct brw_reg reg )
1064 {
1065 return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
1066 }
1067
1068 static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
1069 {
1070 return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
1071 0, 16, 2 );
1072 }
1073
1074 /* One-, two- and three-dimensional Perlin noise, similar to the description
1075 in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
1076 static void noise1_sub( struct brw_wm_compile *c ) {
1077
1078 struct brw_compile *p = &c->func;
1079 struct brw_reg param,
1080 x0, x1, /* gradients at each end */
1081 t, tmp[ 2 ], /* float temporaries */
1082 itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
1083 int i;
1084 int mark = mark_tmps( c );
1085
1086 x0 = alloc_tmp( c );
1087 x1 = alloc_tmp( c );
1088 t = alloc_tmp( c );
1089 tmp[ 0 ] = alloc_tmp( c );
1090 tmp[ 1 ] = alloc_tmp( c );
1091 itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
1092 itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
1093 itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
1094 itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
1095 itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
1096
1097 param = lookup_tmp( c, mark - 2 );
1098
1099 brw_set_access_mode( p, BRW_ALIGN_1 );
1100
1101 brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1102
1103 /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
1104 be hashed. Also compute the remainder (offset within the unit
1105 length), interleaved to reduce register dependency penalties. */
1106 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param );
1107 brw_FRC( p, param, param );
1108 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
1109 brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1110 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1111
1112 /* We're now ready to perform the hashing. The two hashes are
1113 interleaved for performance. The hash function used is
1114 designed to rapidly achieve avalanche and require only 32x16
1115 bit multiplication, and 16-bit swizzles (which we get for
1116 free). We can't use immediate operands in the multiplies,
1117 because immediates are permitted only in src1 and the 16-bit
1118 factor is permitted only in src0. */
1119 for( i = 0; i < 2; i++ )
1120 brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
1121 for( i = 0; i < 2; i++ )
1122 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1123 high_words( itmp[ i ] ) );
1124 for( i = 0; i < 2; i++ )
1125 brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
1126 for( i = 0; i < 2; i++ )
1127 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1128 high_words( itmp[ i ] ) );
1129 for( i = 0; i < 2; i++ )
1130 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1131 for( i = 0; i < 2; i++ )
1132 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1133 high_words( itmp[ i ] ) );
1134
1135 /* Now we want to initialise the two gradients based on the
1136 hashes. Format conversion from signed integer to float leaves
1137 everything scaled too high by a factor of pow( 2, 31 ), but
1138 we correct for that right at the end. */
1139 brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
1140 brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
1141 brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
1142
1143 brw_MUL( p, x0, x0, param );
1144 brw_MUL( p, x1, x1, t );
1145
1146 /* We interpolate between the gradients using the polynomial
1147 6t^5 - 15t^4 + 10t^3 (Perlin). */
1148 brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
1149 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1150 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1151 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1152 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1153 brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
1154 pipeline */
1155 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
1156 brw_MUL( p, param, tmp[ 0 ], param );
1157 brw_MUL( p, x1, x1, param );
1158 brw_ADD( p, x0, x0, x1 );
1159 /* scale by pow( 2, -30 ), to compensate for the format conversion
1160 above and an extra factor of 2 so that a single gradient covers
1161 the [-1,1] range */
1162 brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
1163
1164 release_tmps( c, mark );
1165 }
1166
1167 static void emit_noise1( struct brw_wm_compile *c,
1168 struct prog_instruction *inst )
1169 {
1170 struct brw_compile *p = &c->func;
1171 struct brw_reg src, param, dst;
1172 GLuint mask = inst->DstReg.WriteMask;
1173 int i;
1174 int mark = mark_tmps( c );
1175
1176 assert( mark == 0 );
1177
1178 src = get_src_reg( c, inst->SrcReg, 0, 1 );
1179
1180 param = alloc_tmp( c );
1181
1182 brw_MOV( p, param, src );
1183
1184 invoke_subroutine( c, SUB_NOISE1, noise1_sub );
1185
1186 /* Fill in the result: */
1187 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1188 for (i = 0 ; i < 4; i++) {
1189 if (mask & (1<<i)) {
1190 dst = get_dst_reg(c, inst, i, 1);
1191 brw_MOV( p, dst, param );
1192 }
1193 }
1194 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1195 brw_set_saturate( p, 0 );
1196
1197 release_tmps( c, mark );
1198 }
1199
1200 static void noise2_sub( struct brw_wm_compile *c ) {
1201
1202 struct brw_compile *p = &c->func;
1203 struct brw_reg param0, param1,
1204 x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
1205 t, tmp[ 4 ], /* float temporaries */
1206 itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
1207 int i;
1208 int mark = mark_tmps( c );
1209
1210 x0y0 = alloc_tmp( c );
1211 x0y1 = alloc_tmp( c );
1212 x1y0 = alloc_tmp( c );
1213 x1y1 = alloc_tmp( c );
1214 t = alloc_tmp( c );
1215 for( i = 0; i < 4; i++ ) {
1216 tmp[ i ] = alloc_tmp( c );
1217 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1218 }
1219 itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
1220 itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
1221 itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
1222
1223 param0 = lookup_tmp( c, mark - 3 );
1224 param1 = lookup_tmp( c, mark - 2 );
1225
1226 brw_set_access_mode( p, BRW_ALIGN_1 );
1227
1228 /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
1229 be hashed. Also compute the remainders (offsets within the unit
1230 square), interleaved to reduce register dependency penalties. */
1231 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1232 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1233 brw_FRC( p, param0, param0 );
1234 brw_FRC( p, param1, param1 );
1235 brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
1236 brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
1237 low_words( itmp[ 1 ] ) );
1238 brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
1239 brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
1240 brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
1241 brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
1242 brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
1243
1244 /* We're now ready to perform the hashing. The four hashes are
1245 interleaved for performance. The hash function used is
1246 designed to rapidly achieve avalanche and require only 32x16
1247 bit multiplication, and 16-bit swizzles (which we get for
1248 free). We can't use immediate operands in the multiplies,
1249 because immediates are permitted only in src1 and the 16-bit
1250 factor is permitted only in src0. */
1251 for( i = 0; i < 4; i++ )
1252 brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
1253 for( i = 0; i < 4; i++ )
1254 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1255 high_words( itmp[ i ] ) );
1256 for( i = 0; i < 4; i++ )
1257 brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
1258 for( i = 0; i < 4; i++ )
1259 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1260 high_words( itmp[ i ] ) );
1261 for( i = 0; i < 4; i++ )
1262 brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
1263 for( i = 0; i < 4; i++ )
1264 brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
1265 high_words( itmp[ i ] ) );
1266
1267 /* Now we want to initialise the four gradients based on the
1268 hashes. Format conversion from signed integer to float leaves
1269 everything scaled too high by a factor of pow( 2, 15 ), but
1270 we correct for that right at the end. */
1271 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1272 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1273 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1274 brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
1275 brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
1276
1277 brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
1278 brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
1279 brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
1280 brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
1281
1282 brw_MUL( p, x1y0, x1y0, t );
1283 brw_MUL( p, x1y1, x1y1, t );
1284 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1285 brw_MUL( p, x0y0, x0y0, param0 );
1286 brw_MUL( p, x0y1, x0y1, param0 );
1287
1288 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
1289 brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
1290 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
1291 brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
1292
1293 brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
1294 brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
1295 brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
1296 brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
1297
1298 /* We interpolate between the gradients using the polynomial
1299 6t^5 - 15t^4 + 10t^3 (Perlin). */
1300 brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
1301 brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
1302 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
1303 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
1304 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1305 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1306 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
1307 pipeline */
1308 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
1309 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
1310 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1311 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1312 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
1313 pipeline */
1314 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
1315 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
1316 brw_MUL( p, param0, tmp[ 0 ], param0 );
1317 brw_MUL( p, param1, tmp[ 1 ], param1 );
1318
1319 /* Here we interpolate in the y dimension... */
1320 brw_MUL( p, x0y1, x0y1, param1 );
1321 brw_MUL( p, x1y1, x1y1, param1 );
1322 brw_ADD( p, x0y0, x0y0, x0y1 );
1323 brw_ADD( p, x1y0, x1y0, x1y1 );
1324
1325 /* And now in x. There are horrible register dependencies here,
1326 but we have nothing else to do. */
1327 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1328 brw_MUL( p, x1y0, x1y0, param0 );
1329 brw_ADD( p, x0y0, x0y0, x1y0 );
1330
1331 /* scale by pow( 2, -15 ), as described above */
1332 brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
1333
1334 release_tmps( c, mark );
1335 }
1336
1337 static void emit_noise2( struct brw_wm_compile *c,
1338 struct prog_instruction *inst )
1339 {
1340 struct brw_compile *p = &c->func;
1341 struct brw_reg src0, src1, param0, param1, dst;
1342 GLuint mask = inst->DstReg.WriteMask;
1343 int i;
1344 int mark = mark_tmps( c );
1345
1346 assert( mark == 0 );
1347
1348 src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
1349 src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
1350
1351 param0 = alloc_tmp( c );
1352 param1 = alloc_tmp( c );
1353
1354 brw_MOV( p, param0, src0 );
1355 brw_MOV( p, param1, src1 );
1356
1357 invoke_subroutine( c, SUB_NOISE2, noise2_sub );
1358
1359 /* Fill in the result: */
1360 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1361 for (i = 0 ; i < 4; i++) {
1362 if (mask & (1<<i)) {
1363 dst = get_dst_reg(c, inst, i, 1);
1364 brw_MOV( p, dst, param0 );
1365 }
1366 }
1367 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1368 brw_set_saturate( p, 0 );
1369
1370 release_tmps( c, mark );
1371 }
1372
1373 /**
1374 * The three-dimensional case is much like the one- and two- versions above,
1375 * but since the number of corners is rapidly growing we now pack 16 16-bit
1376 * hashes into each register to extract more parallelism from the EUs.
1377 */
1378 static void noise3_sub( struct brw_wm_compile *c ) {
1379
1380 struct brw_compile *p = &c->func;
1381 struct brw_reg param0, param1, param2,
1382 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1383 xi, yi, zi, /* interpolation coefficients */
1384 t, tmp[ 8 ], /* float temporaries */
1385 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1386 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1387 int i;
1388 int mark = mark_tmps( c );
1389
1390 x0y0 = alloc_tmp( c );
1391 x0y1 = alloc_tmp( c );
1392 x1y0 = alloc_tmp( c );
1393 x1y1 = alloc_tmp( c );
1394 xi = alloc_tmp( c );
1395 yi = alloc_tmp( c );
1396 zi = alloc_tmp( c );
1397 t = alloc_tmp( c );
1398 for( i = 0; i < 8; i++ ) {
1399 tmp[ i ] = alloc_tmp( c );
1400 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1401 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1402 }
1403
1404 param0 = lookup_tmp( c, mark - 4 );
1405 param1 = lookup_tmp( c, mark - 3 );
1406 param2 = lookup_tmp( c, mark - 2 );
1407
1408 brw_set_access_mode( p, BRW_ALIGN_1 );
1409
1410 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1411 be hashed. Also compute the remainders (offsets within the unit
1412 cube), interleaved to reduce register dependency penalties. */
1413 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 );
1414 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 );
1415 brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 );
1416 brw_FRC( p, param0, param0 );
1417 brw_FRC( p, param1, param1 );
1418 brw_FRC( p, param2, param2 );
1419 /* Since we now have only 16 bits of precision in the hash, we must
1420 be more careful about thorough mixing to maintain entropy as we
1421 squash the input vector into a small scalar. */
1422 brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) );
1423 brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) );
1424 brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ),
1425 brw_imm_uw( 0x9B93 ) );
1426 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1427 brw_imm_uw( 0xBC8F ) );
1428
1429 /* Temporarily disable the execution mask while we work with ExecSize=16
1430 channels (the mask is set for ExecSize=8 and is probably incorrect).
1431 Although this might cause execution of unwanted channels, the code
1432 writes only to temporary registers and has no side effects, so
1433 disabling the mask is harmless. */
1434 brw_push_insn_state( p );
1435 brw_set_mask_control( p, BRW_MASK_DISABLE );
1436 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1437 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1438 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1439
1440 /* We're now ready to perform the hashing. The eight hashes are
1441 interleaved for performance. The hash function used is
1442 designed to rapidly achieve avalanche and require only 16x16
1443 bit multiplication, and 8-bit swizzles (which we get for
1444 free). */
1445 for( i = 0; i < 4; i++ )
1446 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1447 for( i = 0; i < 4; i++ )
1448 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1449 odd_bytes( wtmp[ i ] ) );
1450 for( i = 0; i < 4; i++ )
1451 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1452 for( i = 0; i < 4; i++ )
1453 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1454 odd_bytes( wtmp[ i ] ) );
1455 brw_pop_insn_state( p );
1456
1457 /* Now we want to initialise the four rear gradients based on the
1458 hashes. Format conversion from signed integer to float leaves
1459 everything scaled too high by a factor of pow( 2, 15 ), but
1460 we correct for that right at the end. */
1461 /* x component */
1462 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1463 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1464 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1465 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1466 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1467
1468 brw_push_insn_state( p );
1469 brw_set_mask_control( p, BRW_MASK_DISABLE );
1470 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1471 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1472 brw_pop_insn_state( p );
1473
1474 brw_MUL( p, x1y0, x1y0, t );
1475 brw_MUL( p, x1y1, x1y1, t );
1476 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1477 brw_MUL( p, x0y0, x0y0, param0 );
1478 brw_MUL( p, x0y1, x0y1, param0 );
1479
1480 /* y component */
1481 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1482 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1483 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1484 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1485
1486 brw_push_insn_state( p );
1487 brw_set_mask_control( p, BRW_MASK_DISABLE );
1488 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
1489 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
1490 brw_pop_insn_state( p );
1491
1492 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1493 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1494 brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
1495 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1496 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1497
1498 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1499 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1500 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1501 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1502
1503 /* z component */
1504 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1505 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1506 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1507 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1508
1509 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
1510 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
1511 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
1512 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
1513
1514 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1515 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1516 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1517 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1518
1519 /* We interpolate between the gradients using the polynomial
1520 6t^5 - 15t^4 + 10t^3 (Perlin). */
1521 brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
1522 brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
1523 brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
1524 brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
1525 brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
1526 brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
1527 brw_MUL( p, xi, xi, param0 );
1528 brw_MUL( p, yi, yi, param1 );
1529 brw_MUL( p, zi, zi, param2 );
1530 brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
1531 brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
1532 brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
1533 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
1534 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
1535 brw_MUL( p, xi, xi, param0 );
1536 brw_MUL( p, yi, yi, param1 );
1537 brw_MUL( p, zi, zi, param2 );
1538 brw_MUL( p, xi, xi, param0 );
1539 brw_MUL( p, yi, yi, param1 );
1540 brw_MUL( p, zi, zi, param2 );
1541 brw_MUL( p, xi, xi, param0 );
1542 brw_MUL( p, yi, yi, param1 );
1543 brw_MUL( p, zi, zi, param2 );
1544
1545 /* Here we interpolate in the y dimension... */
1546 brw_MUL( p, x0y1, x0y1, yi );
1547 brw_MUL( p, x1y1, x1y1, yi );
1548 brw_ADD( p, x0y0, x0y0, x0y1 );
1549 brw_ADD( p, x1y0, x1y0, x1y1 );
1550
1551 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1552 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1553 brw_MUL( p, x1y0, x1y0, xi );
1554 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
1555
1556 /* Now do the same thing for the front four gradients... */
1557 /* x component */
1558 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
1559 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
1560 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
1561 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
1562
1563 brw_push_insn_state( p );
1564 brw_set_mask_control( p, BRW_MASK_DISABLE );
1565 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1566 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1567 brw_pop_insn_state( p );
1568
1569 brw_MUL( p, x1y0, x1y0, t );
1570 brw_MUL( p, x1y1, x1y1, t );
1571 brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
1572 brw_MUL( p, x0y0, x0y0, param0 );
1573 brw_MUL( p, x0y1, x0y1, param0 );
1574
1575 /* y component */
1576 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1577 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1578 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1579 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1580
1581 brw_push_insn_state( p );
1582 brw_set_mask_control( p, BRW_MASK_DISABLE );
1583 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
1584 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
1585 brw_pop_insn_state( p );
1586
1587 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1588 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1589 brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
1590 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
1591 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
1592
1593 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1594 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1595 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1596 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1597
1598 /* z component */
1599 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1600 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1601 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1602 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1603
1604 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1605 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1606 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1607 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1608
1609 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1610 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1611 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1612 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1613
1614 /* The interpolation coefficients are still around from last time, so
1615 again interpolate in the y dimension... */
1616 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1617 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1618 brw_MUL( p, x0y1, x0y1, yi );
1619 brw_MUL( p, x1y1, x1y1, yi );
1620 brw_ADD( p, x0y0, x0y0, x0y1 );
1621 brw_ADD( p, x1y0, x1y0, x1y1 );
1622
1623 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
1624 time put the front face in tmp[ 1 ] and we're nearly there... */
1625 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1626 brw_MUL( p, x1y0, x1y0, xi );
1627 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
1628
1629 /* The final interpolation, in the z dimension: */
1630 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
1631 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
1632 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
1633
1634 /* scale by pow( 2, -15 ), as described above */
1635 brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
1636
1637 release_tmps( c, mark );
1638 }
1639
1640 static void emit_noise3( struct brw_wm_compile *c,
1641 struct prog_instruction *inst )
1642 {
1643 struct brw_compile *p = &c->func;
1644 struct brw_reg src0, src1, src2, param0, param1, param2, dst;
1645 GLuint mask = inst->DstReg.WriteMask;
1646 int i;
1647 int mark = mark_tmps( c );
1648
1649 assert( mark == 0 );
1650
1651 src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
1652 src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
1653 src2 = get_src_reg( c, inst->SrcReg, 2, 1 );
1654
1655 param0 = alloc_tmp( c );
1656 param1 = alloc_tmp( c );
1657 param2 = alloc_tmp( c );
1658
1659 brw_MOV( p, param0, src0 );
1660 brw_MOV( p, param1, src1 );
1661 brw_MOV( p, param2, src2 );
1662
1663 invoke_subroutine( c, SUB_NOISE3, noise3_sub );
1664
1665 /* Fill in the result: */
1666 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
1667 for (i = 0 ; i < 4; i++) {
1668 if (mask & (1<<i)) {
1669 dst = get_dst_reg(c, inst, i, 1);
1670 brw_MOV( p, dst, param0 );
1671 }
1672 }
1673 if( inst->SaturateMode == SATURATE_ZERO_ONE )
1674 brw_set_saturate( p, 0 );
1675
1676 release_tmps( c, mark );
1677 }
1678
1679 /**
1680 * For the four-dimensional case, the little micro-optimisation benefits
1681 * we obtain by unrolling all the loops aren't worth the massive bloat it
1682 * now causes. Instead, we loop twice around performing a similar operation
1683 * to noise3, once for the w=0 cube and once for the w=1, with a bit more
1684 * code to glue it all together.
1685 */
1686 static void noise4_sub( struct brw_wm_compile *c )
1687 {
1688 struct brw_compile *p = &c->func;
1689 struct brw_reg param[ 4 ],
1690 x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
1691 w0, /* noise for the w=0 cube */
1692 floors[ 2 ], /* integer coordinates of base corner of hypercube */
1693 interp[ 4 ], /* interpolation coefficients */
1694 t, tmp[ 8 ], /* float temporaries */
1695 itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
1696 wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
1697 int i, j;
1698 int mark = mark_tmps( c );
1699 GLuint loop, origin;
1700
1701 x0y0 = alloc_tmp( c );
1702 x0y1 = alloc_tmp( c );
1703 x1y0 = alloc_tmp( c );
1704 x1y1 = alloc_tmp( c );
1705 t = alloc_tmp( c );
1706 w0 = alloc_tmp( c );
1707 floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1708 floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD );
1709
1710 for( i = 0; i < 4; i++ ) {
1711 param[ i ] = lookup_tmp( c, mark - 5 + i );
1712 interp[ i ] = alloc_tmp( c );
1713 }
1714
1715 for( i = 0; i < 8; i++ ) {
1716 tmp[ i ] = alloc_tmp( c );
1717 itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
1718 wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
1719 }
1720
1721 brw_set_access_mode( p, BRW_ALIGN_1 );
1722
1723 /* We only want 16 bits of precision from the integral part of each
1724 co-ordinate, but unfortunately the RNDD semantics would saturate
1725 at 16 bits if we performed the operation directly to a 16-bit
1726 destination. Therefore, we round to 32-bit temporaries where
1727 appropriate, and then store only the lower 16 bits. */
1728 brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] );
1729 brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] );
1730 brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] );
1731 brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] );
1732 brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) );
1733 brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) );
1734
1735 /* Modify the flag register here, because the side effect is useful
1736 later (see below). We know for certain that all flags will be
1737 cleared, since the FRC instruction cannot possibly generate
1738 negative results. Even for exceptional inputs (infinities, denormals,
1739 NaNs), the architecture guarantees that the L conditional is false. */
1740 brw_set_conditionalmod( p, BRW_CONDITIONAL_L );
1741 brw_FRC( p, param[ 0 ], param[ 0 ] );
1742 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1743 for( i = 1; i < 4; i++ )
1744 brw_FRC( p, param[ i ], param[ i ] );
1745
1746 /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first
1747 of all. */
1748 for( i = 0; i < 4; i++ )
1749 brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) );
1750 for( i = 0; i < 4; i++ )
1751 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) );
1752 for( i = 0; i < 4; i++ )
1753 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1754 for( i = 0; i < 4; i++ )
1755 brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) );
1756 for( j = 0; j < 3; j++ )
1757 for( i = 0; i < 4; i++ )
1758 brw_MUL( p, interp[ i ], interp[ i ], param[ i ] );
1759
1760 /* Mark the current address, as it will be a jump destination. The
1761 following code will be executed twice: first, with the flag
1762 register clear indicating the w=0 case, and second with flags
1763 set for w=1. */
1764 loop = p->nr_insn;
1765
1766 /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
1767 be hashed. Since we have only 16 bits of precision in the hash, we
1768 must be careful about thorough mixing to maintain entropy as we
1769 squash the input vector into a small scalar. */
1770 brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ),
1771 brw_imm_uw( 0xBC8F ) );
1772 brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ),
1773 brw_imm_uw( 0xD0BD ) );
1774 brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ),
1775 brw_imm_uw( 0x9B93 ) );
1776 brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ),
1777 brw_imm_uw( 0xA359 ) );
1778 brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
1779 brw_imm_uw( 0xBC8F ) );
1780
1781 /* Temporarily disable the execution mask while we work with ExecSize=16
1782 channels (the mask is set for ExecSize=8 and is probably incorrect).
1783 Although this might cause execution of unwanted channels, the code
1784 writes only to temporary registers and has no side effects, so
1785 disabling the mask is harmless. */
1786 brw_push_insn_state( p );
1787 brw_set_mask_control( p, BRW_MASK_DISABLE );
1788 brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
1789 brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
1790 brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
1791
1792 /* We're now ready to perform the hashing. The eight hashes are
1793 interleaved for performance. The hash function used is
1794 designed to rapidly achieve avalanche and require only 16x16
1795 bit multiplication, and 8-bit swizzles (which we get for
1796 free). */
1797 for( i = 0; i < 4; i++ )
1798 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
1799 for( i = 0; i < 4; i++ )
1800 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1801 odd_bytes( wtmp[ i ] ) );
1802 for( i = 0; i < 4; i++ )
1803 brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
1804 for( i = 0; i < 4; i++ )
1805 brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
1806 odd_bytes( wtmp[ i ] ) );
1807 brw_pop_insn_state( p );
1808
1809 /* Now we want to initialise the four rear gradients based on the
1810 hashes. Format conversion from signed integer to float leaves
1811 everything scaled too high by a factor of pow( 2, 15 ), but
1812 we correct for that right at the end. */
1813 /* x component */
1814 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
1815 brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
1816 brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
1817 brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
1818 brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
1819
1820 brw_push_insn_state( p );
1821 brw_set_mask_control( p, BRW_MASK_DISABLE );
1822 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1823 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1824 brw_pop_insn_state( p );
1825
1826 brw_MUL( p, x1y0, x1y0, t );
1827 brw_MUL( p, x1y1, x1y1, t );
1828 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
1829 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
1830 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
1831
1832 /* y component */
1833 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1834 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1835 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1836 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1837
1838 brw_push_insn_state( p );
1839 brw_set_mask_control( p, BRW_MASK_DISABLE );
1840 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1841 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1842 brw_pop_insn_state( p );
1843
1844 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1845 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1846 /* prepare t for the w component (used below): w the first time through
1847 the loop; w - 1 the second time) */
1848 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
1849 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
1850 p->current->header.predicate_inverse = 1;
1851 brw_MOV( p, t, param[ 3 ] );
1852 p->current->header.predicate_inverse = 0;
1853 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1854 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
1855 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
1856
1857 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1858 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1859 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1860 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1861
1862 /* z component */
1863 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1864 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1865 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1866 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1867
1868 brw_push_insn_state( p );
1869 brw_set_mask_control( p, BRW_MASK_DISABLE );
1870 brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) );
1871 brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) );
1872 brw_pop_insn_state( p );
1873
1874 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] );
1875 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] );
1876 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] );
1877 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] );
1878
1879 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1880 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1881 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1882 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1883
1884 /* w component */
1885 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
1886 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
1887 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
1888 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
1889
1890 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1891 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1892 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1893 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1894 brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) );
1895
1896 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1897 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1898 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1899 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1900
1901 /* Here we interpolate in the y dimension... */
1902 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
1903 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
1904 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
1905 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
1906 brw_ADD( p, x0y0, x0y0, x0y1 );
1907 brw_ADD( p, x1y0, x1y0, x1y1 );
1908
1909 /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
1910 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
1911 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
1912 brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
1913
1914 /* Now do the same thing for the front four gradients... */
1915 /* x component */
1916 brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
1917 brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
1918 brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
1919 brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
1920
1921 brw_push_insn_state( p );
1922 brw_set_mask_control( p, BRW_MASK_DISABLE );
1923 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
1924 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
1925 brw_pop_insn_state( p );
1926
1927 brw_MUL( p, x1y0, x1y0, t );
1928 brw_MUL( p, x1y1, x1y1, t );
1929 brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) );
1930 brw_MUL( p, x0y0, x0y0, param[ 0 ] );
1931 brw_MUL( p, x0y1, x0y1, param[ 0 ] );
1932
1933 /* y component */
1934 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1935 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1936 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1937 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1938
1939 brw_push_insn_state( p );
1940 brw_set_mask_control( p, BRW_MASK_DISABLE );
1941 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
1942 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
1943 brw_pop_insn_state( p );
1944
1945 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1946 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1947 brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) );
1948 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] );
1949 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] );
1950
1951 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1952 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1953 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1954 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1955
1956 /* z component */
1957 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1958 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1959 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1960 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1961
1962 brw_push_insn_state( p );
1963 brw_set_mask_control( p, BRW_MASK_DISABLE );
1964 brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) );
1965 brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) );
1966 brw_pop_insn_state( p );
1967
1968 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1969 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1970 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1971 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1972 /* prepare t for the w component (used below): w the first time through
1973 the loop; w - 1 the second time) */
1974 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
1975 brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) );
1976 p->current->header.predicate_inverse = 1;
1977 brw_MOV( p, t, param[ 3 ] );
1978 p->current->header.predicate_inverse = 0;
1979 brw_set_predicate_control( p, BRW_PREDICATE_NONE );
1980
1981 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1982 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1983 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
1984 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
1985
1986 /* w component */
1987 brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
1988 brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
1989 brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
1990 brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
1991
1992 brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
1993 brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
1994 brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
1995 brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
1996
1997 brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
1998 brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
1999 brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
2000 brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
2001
2002 /* Interpolate in the y dimension: */
2003 brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
2004 brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
2005 brw_MUL( p, x0y1, x0y1, interp[ 1 ] );
2006 brw_MUL( p, x1y1, x1y1, interp[ 1 ] );
2007 brw_ADD( p, x0y0, x0y0, x0y1 );
2008 brw_ADD( p, x1y0, x1y0, x1y1 );
2009
2010 /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
2011 time put the front face in tmp[ 1 ] and we're nearly there... */
2012 brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
2013 brw_MUL( p, x1y0, x1y0, interp[ 0 ] );
2014 brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
2015
2016 /* Another interpolation, in the z dimension: */
2017 brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
2018 brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] );
2019 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
2020
2021 /* Exit the loop if we've computed both cubes... */
2022 origin = p->nr_insn;
2023 brw_push_insn_state( p );
2024 brw_set_predicate_control( p, BRW_PREDICATE_NORMAL );
2025 brw_set_mask_control( p, BRW_MASK_DISABLE );
2026 brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) );
2027 brw_pop_insn_state( p );
2028
2029 /* Save the result for the w=0 case, and increment the w coordinate: */
2030 brw_MOV( p, w0, tmp[ 0 ] );
2031 brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ),
2032 brw_imm_uw( 1 ) );
2033
2034 /* Loop around for the other cube. Explicitly set the flag register
2035 (unfortunately we must spend an extra instruction to do this: we
2036 can't rely on a side effect of the previous MOV or ADD because
2037 conditional modifiers which are normally true might be false in
2038 exceptional circumstances, e.g. given a NaN input; the add to
2039 brw_ip_reg() is not suitable because the IP is not an 8-vector). */
2040 brw_push_insn_state( p );
2041 brw_set_mask_control( p, BRW_MASK_DISABLE );
2042 brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) );
2043 brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
2044 brw_imm_d( ( loop - p->nr_insn ) << 4 ) );
2045 brw_pop_insn_state( p );
2046
2047 /* Patch the previous conditional branch now that we know the
2048 destination address. */
2049 brw_set_src1( p->store + origin,
2050 brw_imm_d( ( p->nr_insn - origin ) << 4 ) );
2051
2052 /* The very last interpolation. */
2053 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) );
2054 brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] );
2055 brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 );
2056
2057 /* scale by pow( 2, -15 ), as described above */
2058 brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
2059
2060 release_tmps( c, mark );
2061 }
2062
2063 static void emit_noise4( struct brw_wm_compile *c,
2064 struct prog_instruction *inst )
2065 {
2066 struct brw_compile *p = &c->func;
2067 struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
2068 GLuint mask = inst->DstReg.WriteMask;
2069 int i;
2070 int mark = mark_tmps( c );
2071
2072 assert( mark == 0 );
2073
2074 src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
2075 src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
2076 src2 = get_src_reg( c, inst->SrcReg, 2, 1 );
2077 src3 = get_src_reg( c, inst->SrcReg, 3, 1 );
2078
2079 param0 = alloc_tmp( c );
2080 param1 = alloc_tmp( c );
2081 param2 = alloc_tmp( c );
2082 param3 = alloc_tmp( c );
2083
2084 brw_MOV( p, param0, src0 );
2085 brw_MOV( p, param1, src1 );
2086 brw_MOV( p, param2, src2 );
2087 brw_MOV( p, param3, src3 );
2088
2089 invoke_subroutine( c, SUB_NOISE4, noise4_sub );
2090
2091 /* Fill in the result: */
2092 brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
2093 for (i = 0 ; i < 4; i++) {
2094 if (mask & (1<<i)) {
2095 dst = get_dst_reg(c, inst, i, 1);
2096 brw_MOV( p, dst, param0 );
2097 }
2098 }
2099 if( inst->SaturateMode == SATURATE_ZERO_ONE )
2100 brw_set_saturate( p, 0 );
2101
2102 release_tmps( c, mark );
2103 }
2104
2105 static void emit_wpos_xy(struct brw_wm_compile *c,
2106 struct prog_instruction *inst)
2107 {
2108 struct brw_compile *p = &c->func;
2109 GLuint mask = inst->DstReg.WriteMask;
2110 struct brw_reg src0[2], dst[2];
2111
2112 dst[0] = get_dst_reg(c, inst, 0, 1);
2113 dst[1] = get_dst_reg(c, inst, 1, 1);
2114
2115 src0[0] = get_src_reg(c, &inst->SrcReg[0], 0, 1);
2116 src0[1] = get_src_reg(c, &inst->SrcReg[0], 1, 1);
2117
2118 /* Calculate the pixel offset from window bottom left into destination
2119 * X and Y channels.
2120 */
2121 if (mask & WRITEMASK_X) {
2122 /* X' = X - origin_x */
2123 brw_ADD(p,
2124 dst[0],
2125 retype(src0[0], BRW_REGISTER_TYPE_W),
2126 brw_imm_d(0 - c->key.origin_x));
2127 }
2128
2129 if (mask & WRITEMASK_Y) {
2130 /* Y' = height - (Y - origin_y) = height + origin_y - Y */
2131 brw_ADD(p,
2132 dst[1],
2133 negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
2134 brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
2135 }
2136 }
2137
2138 /* TODO
2139 BIAS on SIMD8 not workind yet...
2140 */
2141 static void emit_txb(struct brw_wm_compile *c,
2142 struct prog_instruction *inst)
2143 {
2144 struct brw_compile *p = &c->func;
2145 struct brw_reg dst[4], src[4], payload_reg;
2146 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2147
2148 GLuint i;
2149 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2150 for (i = 0; i < 4; i++)
2151 dst[i] = get_dst_reg(c, inst, i, 1);
2152 for (i = 0; i < 4; i++)
2153 src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
2154
2155 switch (inst->TexSrcTarget) {
2156 case TEXTURE_1D_INDEX:
2157 brw_MOV(p, brw_message_reg(2), src[0]);
2158 brw_MOV(p, brw_message_reg(3), brw_imm_f(0));
2159 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2160 break;
2161 case TEXTURE_2D_INDEX:
2162 case TEXTURE_RECT_INDEX:
2163 brw_MOV(p, brw_message_reg(2), src[0]);
2164 brw_MOV(p, brw_message_reg(3), src[1]);
2165 brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
2166 break;
2167 default:
2168 brw_MOV(p, brw_message_reg(2), src[0]);
2169 brw_MOV(p, brw_message_reg(3), src[1]);
2170 brw_MOV(p, brw_message_reg(4), src[2]);
2171 break;
2172 }
2173 brw_MOV(p, brw_message_reg(5), src[3]);
2174 brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
2175 brw_SAMPLE(p,
2176 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
2177 1,
2178 retype(payload_reg, BRW_REGISTER_TYPE_UW),
2179 unit + MAX_DRAW_BUFFERS, /* surface */
2180 unit, /* sampler */
2181 inst->DstReg.WriteMask,
2182 BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,
2183 4,
2184 4,
2185 0);
2186 }
2187
2188 static void emit_tex(struct brw_wm_compile *c,
2189 struct prog_instruction *inst)
2190 {
2191 struct brw_compile *p = &c->func;
2192 struct brw_reg dst[4], src[4], payload_reg;
2193 GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
2194
2195 GLuint msg_len;
2196 GLuint i, nr;
2197 GLuint emit;
2198 GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
2199
2200 payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
2201
2202 for (i = 0; i < 4; i++)
2203 dst[i] = get_dst_reg(c, inst, i, 1);
2204 for (i = 0; i < 4; i++)
2205 src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
2206
2207
2208 switch (inst->TexSrcTarget) {
2209 case TEXTURE_1D_INDEX:
2210 emit = WRITEMASK_X;
2211 nr = 1;
2212 break;
2213 case TEXTURE_2D_INDEX:
2214 case TEXTURE_RECT_INDEX:
2215 emit = WRITEMASK_XY;
2216 nr = 2;
2217 break;
2218 default:
2219 emit = WRITEMASK_XYZ;
2220 nr = 3;
2221 break;
2222 }
2223 msg_len = 1;
2224
2225 for (i = 0; i < nr; i++) {
2226 static const GLuint swz[4] = {0,1,2,2};
2227 if (emit & (1<<i))
2228 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
2229 else
2230 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
2231 msg_len += 1;
2232 }
2233
2234 if (shadow) {
2235 brw_MOV(p, brw_message_reg(5), brw_imm_f(0));
2236 brw_MOV(p, brw_message_reg(6), src[2]);
2237 }
2238
2239 brw_SAMPLE(p,
2240 retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
2241 1,
2242 retype(payload_reg, BRW_REGISTER_TYPE_UW),
2243 unit + MAX_DRAW_BUFFERS, /* surface */
2244 unit, /* sampler */
2245 inst->DstReg.WriteMask,
2246 BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,
2247 4,
2248 shadow ? 6 : 4,
2249 0);
2250
2251 if (shadow)
2252 brw_MOV(p, dst[3], brw_imm_f(1.0));
2253 }
2254
2255 /**
2256 * Resolve subroutine calls after code emit is done.
2257 */
2258 static void post_wm_emit( struct brw_wm_compile *c )
2259 {
2260 brw_resolve_cals(&c->func);
2261 }
2262
2263 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
2264 {
2265 #define MAX_IFSN 32
2266 #define MAX_LOOP_DEPTH 32
2267 struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
2268 struct brw_instruction *inst0, *inst1;
2269 int i, if_insn = 0, loop_insn = 0;
2270 struct brw_compile *p = &c->func;
2271 struct brw_indirect stack_index = brw_indirect(0, 0);
2272
2273 c->reg_index = 0;
2274 prealloc_reg(c);
2275 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2276 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
2277
2278 for (i = 0; i < c->nr_fp_insns; i++) {
2279 struct prog_instruction *inst = &c->prog_instructions[i];
2280
2281 if (inst->CondUpdate)
2282 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
2283 else
2284 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
2285
2286 switch (inst->Opcode) {
2287 case WM_PIXELXY:
2288 emit_pixel_xy(c, inst);
2289 break;
2290 case WM_DELTAXY:
2291 emit_delta_xy(c, inst);
2292 break;
2293 case WM_PIXELW:
2294 emit_pixel_w(c, inst);
2295 break;
2296 case WM_LINTERP:
2297 emit_linterp(c, inst);
2298 break;
2299 case WM_PINTERP:
2300 emit_pinterp(c, inst);
2301 break;
2302 case WM_CINTERP:
2303 emit_cinterp(c, inst);
2304 break;
2305 case WM_WPOSXY:
2306 emit_wpos_xy(c, inst);
2307 break;
2308 case WM_FB_WRITE:
2309 emit_fb_write(c, inst);
2310 break;
2311 case OPCODE_ABS:
2312 emit_abs(c, inst);
2313 break;
2314 case OPCODE_ADD:
2315 emit_add(c, inst);
2316 break;
2317 case OPCODE_SUB:
2318 emit_sub(c, inst);
2319 break;
2320 case OPCODE_FRC:
2321 emit_frc(c, inst);
2322 break;
2323 case OPCODE_FLR:
2324 emit_flr(c, inst);
2325 break;
2326 case OPCODE_LRP:
2327 emit_lrp(c, inst);
2328 break;
2329 case OPCODE_TRUNC:
2330 emit_trunc(c, inst);
2331 break;
2332 case OPCODE_MOV:
2333 emit_mov(c, inst);
2334 break;
2335 case OPCODE_DP3:
2336 emit_dp3(c, inst);
2337 break;
2338 case OPCODE_DP4:
2339 emit_dp4(c, inst);
2340 break;
2341 case OPCODE_XPD:
2342 emit_xpd(c, inst);
2343 break;
2344 case OPCODE_DPH:
2345 emit_dph(c, inst);
2346 break;
2347 case OPCODE_RCP:
2348 emit_rcp(c, inst);
2349 break;
2350 case OPCODE_RSQ:
2351 emit_rsq(c, inst);
2352 break;
2353 case OPCODE_SIN:
2354 emit_sin(c, inst);
2355 break;
2356 case OPCODE_COS:
2357 emit_cos(c, inst);
2358 break;
2359 case OPCODE_EX2:
2360 emit_ex2(c, inst);
2361 break;
2362 case OPCODE_LG2:
2363 emit_lg2(c, inst);
2364 break;
2365 case OPCODE_MAX:
2366 emit_max(c, inst);
2367 break;
2368 case OPCODE_MIN:
2369 emit_min(c, inst);
2370 break;
2371 case OPCODE_DDX:
2372 emit_ddx(c, inst);
2373 break;
2374 case OPCODE_DDY:
2375 emit_ddy(c, inst);
2376 break;
2377 case OPCODE_SLT:
2378 emit_slt(c, inst);
2379 break;
2380 case OPCODE_SLE:
2381 emit_sle(c, inst);
2382 break;
2383 case OPCODE_SGT:
2384 emit_sgt(c, inst);
2385 break;
2386 case OPCODE_SGE:
2387 emit_sge(c, inst);
2388 break;
2389 case OPCODE_SEQ:
2390 emit_seq(c, inst);
2391 break;
2392 case OPCODE_SNE:
2393 emit_sne(c, inst);
2394 break;
2395 case OPCODE_MUL:
2396 emit_mul(c, inst);
2397 break;
2398 case OPCODE_POW:
2399 emit_pow(c, inst);
2400 break;
2401 case OPCODE_MAD:
2402 emit_mad(c, inst);
2403 break;
2404 case OPCODE_NOISE1:
2405 emit_noise1(c, inst);
2406 break;
2407 case OPCODE_NOISE2:
2408 emit_noise2(c, inst);
2409 break;
2410 case OPCODE_NOISE3:
2411 emit_noise3(c, inst);
2412 break;
2413 case OPCODE_NOISE4:
2414 emit_noise4(c, inst);
2415 break;
2416 case OPCODE_TEX:
2417 emit_tex(c, inst);
2418 break;
2419 case OPCODE_TXB:
2420 emit_txb(c, inst);
2421 break;
2422 case OPCODE_KIL_NV:
2423 emit_kil(c);
2424 break;
2425 case OPCODE_IF:
2426 assert(if_insn < MAX_IFSN);
2427 if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
2428 break;
2429 case OPCODE_ELSE:
2430 if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
2431 break;
2432 case OPCODE_ENDIF:
2433 assert(if_insn > 0);
2434 brw_ENDIF(p, if_inst[--if_insn]);
2435 break;
2436 case OPCODE_BGNSUB:
2437 brw_save_label(p, inst->Comment, p->nr_insn);
2438 break;
2439 case OPCODE_ENDSUB:
2440 /* no-op */
2441 break;
2442 case OPCODE_CAL:
2443 brw_push_insn_state(p);
2444 brw_set_mask_control(p, BRW_MASK_DISABLE);
2445 brw_set_access_mode(p, BRW_ALIGN_1);
2446 brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2447 brw_set_access_mode(p, BRW_ALIGN_16);
2448 brw_ADD(p, get_addr_reg(stack_index),
2449 get_addr_reg(stack_index), brw_imm_d(4));
2450 brw_save_call(&c->func, inst->Comment, p->nr_insn);
2451 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2452 brw_pop_insn_state(p);
2453 break;
2454
2455 case OPCODE_RET:
2456 brw_push_insn_state(p);
2457 brw_set_mask_control(p, BRW_MASK_DISABLE);
2458 brw_ADD(p, get_addr_reg(stack_index),
2459 get_addr_reg(stack_index), brw_imm_d(-4));
2460 brw_set_access_mode(p, BRW_ALIGN_1);
2461 brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
2462 brw_set_access_mode(p, BRW_ALIGN_16);
2463 brw_pop_insn_state(p);
2464
2465 break;
2466 case OPCODE_BGNLOOP:
2467 loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
2468 break;
2469 case OPCODE_BRK:
2470 brw_BREAK(p);
2471 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2472 break;
2473 case OPCODE_CONT:
2474 brw_CONT(p);
2475 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2476 break;
2477 case OPCODE_ENDLOOP:
2478 loop_insn--;
2479 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
2480 /* patch all the BREAK instructions from
2481 last BEGINLOOP */
2482 while (inst0 > loop_inst[loop_insn]) {
2483 inst0--;
2484 if (inst0->header.opcode == BRW_OPCODE_BREAK) {
2485 inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
2486 inst0->bits3.if_else.pop_count = 0;
2487 } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
2488 inst0->bits3.if_else.jump_count = inst1 - inst0;
2489 inst0->bits3.if_else.pop_count = 0;
2490 }
2491 }
2492 break;
2493 default:
2494 _mesa_printf("unsupported IR in fragment shader %d\n",
2495 inst->Opcode);
2496 }
2497 if (inst->CondUpdate)
2498 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
2499 else
2500 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2501 }
2502 post_wm_emit(c);
2503 }
2504
2505
2506 /**
2507 * Do GPU code generation for shaders that use GLSL features such as
2508 * flow control. Other shaders will be compiled with the
2509 */
2510 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
2511 {
2512 if (INTEL_DEBUG & DEBUG_WM) {
2513 _mesa_printf("brw_wm_glsl_emit:\n");
2514 }
2515
2516 /* initial instruction translation/simplification */
2517 brw_wm_pass_fp(c);
2518
2519 /* actual code generation */
2520 brw_wm_emit_glsl(brw, c);
2521
2522 if (INTEL_DEBUG & DEBUG_WM) {
2523 brw_wm_print_program(c, "brw_wm_glsl_emit done");
2524 }
2525
2526 c->prog_data.total_grf = c->reg_index;
2527 c->prog_data.total_scratch = 0;
2528 }