4 TGSI, Tungsten Graphics Shader Infrastructure, is an intermediate language
5 for describing shaders. Since Gallium is inherently shaderful, shaders are
6 an important part of the API. TGSI is the only intermediate representation
9 From GL_NV_vertex_program
10 -------------------------
13 ARL - Address Register Load
17 dst.x = \lfloor src.x\rfloor
19 dst.y = \lfloor src.y\rfloor
21 dst.z = \lfloor src.z\rfloor
23 dst.w = \lfloor src.w\rfloor
39 LIT - Light Coefficients
45 dst.y = max(src.x, 0.0)
47 dst.z = (src.x > 0.0) ? max(src.y, 0.0)^{clamp(src.w, -128.0, 128.0)} : 0.0
56 dst.x = \frac{1}{src.x}
58 dst.y = \frac{1}{src.x}
60 dst.z = \frac{1}{src.x}
62 dst.w = \frac{1}{src.x}
65 RSQ - Reciprocal Square Root
69 dst.x = \frac{1}{\sqrt{|src.x|}}
71 dst.y = \frac{1}{\sqrt{|src.x|}}
73 dst.z = \frac{1}{\sqrt{|src.x|}}
75 dst.w = \frac{1}{\sqrt{|src.x|}}
78 EXP - Approximate Exponential Base 2
82 dst.x = 2^{\lfloor src.x\rfloor}
84 dst.y = src.x - \lfloor src.x\rfloor
91 LOG - Approximate Logarithm Base 2
95 dst.x = \lfloor\log_2{|src.x|}\rfloor
97 dst.y = \frac{|src.x|}{2^{\lfloor\log_2{|src.x|}\rfloor}}
99 dst.z = \log_2{|src.x|}
108 dst.x = src0.x * src1.x
110 dst.y = src0.y * src1.y
112 dst.z = src0.z * src1.z
114 dst.w = src0.w * src1.w
121 dst.x = src0.x + src1.x
123 dst.y = src0.y + src1.y
125 dst.z = src0.z + src1.z
127 dst.w = src0.w + src1.w
130 DP3 - 3-component Dot Product
134 dst.x = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
136 dst.y = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
138 dst.z = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
140 dst.w = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
143 DP4 - 4-component Dot Product
147 dst.x = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
149 dst.y = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
151 dst.z = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
153 dst.w = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
156 DST - Distance Vector
162 dst.y = src0.y * src1.y
173 dst.x = min(src0.x, src1.x)
175 dst.y = min(src0.y, src1.y)
177 dst.z = min(src0.z, src1.z)
179 dst.w = min(src0.w, src1.w)
186 dst.x = max(src0.x, src1.x)
188 dst.y = max(src0.y, src1.y)
190 dst.z = max(src0.z, src1.z)
192 dst.w = max(src0.w, src1.w)
195 SLT - Set On Less Than
199 dst.x = (src0.x < src1.x) ? 1.0 : 0.0
201 dst.y = (src0.y < src1.y) ? 1.0 : 0.0
203 dst.z = (src0.z < src1.z) ? 1.0 : 0.0
205 dst.w = (src0.w < src1.w) ? 1.0 : 0.0
208 SGE - Set On Greater Equal Than
212 dst.x = (src0.x >= src1.x) ? 1.0 : 0.0
214 dst.y = (src0.y >= src1.y) ? 1.0 : 0.0
216 dst.z = (src0.z >= src1.z) ? 1.0 : 0.0
218 dst.w = (src0.w >= src1.w) ? 1.0 : 0.0
221 MAD - Multiply And Add
225 dst.x = src0.x * src1.x + src2.x
227 dst.y = src0.y * src1.y + src2.y
229 dst.z = src0.z * src1.z + src2.z
231 dst.w = src0.w * src1.w + src2.w
238 dst.x = src0.x - src1.x
240 dst.y = src0.y - src1.y
242 dst.z = src0.z - src1.z
244 dst.w = src0.w - src1.w
247 LRP - Linear Interpolate
251 dst.x = src0.x * (src1.x - src2.x) + src2.x
253 dst.y = src0.y * (src1.y - src2.y) + src2.y
255 dst.z = src0.z * (src1.z - src2.z) + src2.z
257 dst.w = src0.w * (src1.w - src2.w) + src2.w
264 dst.x = (src2.x > 0.5) ? src0.x : src1.x
266 dst.y = (src2.y > 0.5) ? src0.y : src1.y
268 dst.z = (src2.z > 0.5) ? src0.z : src1.z
270 dst.w = (src2.w > 0.5) ? src0.w : src1.w
273 DP2A - 2-component Dot Product And Add
277 dst.x = src0.x * src1.x + src0.y * src1.y + src2.x
279 dst.y = src0.x * src1.x + src0.y * src1.y + src2.x
281 dst.z = src0.x * src1.x + src0.y * src1.y + src2.x
283 dst.w = src0.x * src1.x + src0.y * src1.y + src2.x
290 dst.x = src.x - \lfloor src.x\rfloor
292 dst.y = src.y - \lfloor src.y\rfloor
294 dst.z = src.z - \lfloor src.z\rfloor
296 dst.w = src.w - \lfloor src.w\rfloor
303 dst.x = clamp(src0.x, src1.x, src2.x)
304 dst.y = clamp(src0.y, src1.y, src2.y)
305 dst.z = clamp(src0.z, src1.z, src2.z)
306 dst.w = clamp(src0.w, src1.w, src2.w)
311 This is identical to ARL.
315 dst.x = \lfloor src.x\rfloor
317 dst.y = \lfloor src.y\rfloor
319 dst.z = \lfloor src.z\rfloor
321 dst.w = \lfloor src.w\rfloor
334 EX2 - Exponential Base 2
347 LG2 - Logarithm Base 2
351 dst.x = \log_2{src.x}
353 dst.y = \log_2{src.x}
355 dst.z = \log_2{src.x}
357 dst.w = \log_2{src.x}
364 dst.x = src0.x^{src1.x}
366 dst.y = src0.x^{src1.x}
368 dst.z = src0.x^{src1.x}
370 dst.w = src0.x^{src1.x}
372 1.3.15 XPD - Cross Product
376 dst.x = src0.y * src1.z - src1.y * src0.z
377 dst.y = src0.z * src1.x - src1.z * src0.x
378 dst.z = src0.x * src1.y - src1.x * src0.y
395 1.4.2 RCC - Reciprocal Clamped
399 dst.x = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
400 dst.y = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
401 dst.z = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
402 dst.w = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
405 1.4.3 DPH - Homogeneous Dot Product
409 dst.x = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
410 dst.y = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
411 dst.z = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
412 dst.w = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
428 1.5.2 DDX - Derivative Relative To X
432 dst.x = partialx(src.x)
433 dst.y = partialx(src.y)
434 dst.z = partialx(src.z)
435 dst.w = partialx(src.w)
438 1.5.3 DDY - Derivative Relative To Y
442 dst.x = partialy(src.x)
443 dst.y = partialy(src.y)
444 dst.z = partialy(src.z)
445 dst.w = partialy(src.w)
448 1.5.7 KILP - Predicated Discard
455 1.5.10 PK2H - Pack Two 16-bit Floats
460 1.5.11 PK2US - Pack Two Unsigned 16-bit Scalars
465 1.5.12 PK4B - Pack Four Signed 8-bit Scalars
470 1.5.13 PK4UB - Pack Four Unsigned 8-bit Scalars
475 1.5.15 RFL - Reflection Vector
479 dst.x = 2.0 * (src0.x * src1.x + src0.y * src1.y + src0.z * src1.z) / (src0.x * src0.x + src0.y * src0.y + src0.z * src0.z) * src0.x - src1.x
480 dst.y = 2.0 * (src0.x * src1.x + src0.y * src1.y + src0.z * src1.z) / (src0.x * src0.x + src0.y * src0.y + src0.z * src0.z) * src0.y - src1.y
481 dst.z = 2.0 * (src0.x * src1.x + src0.y * src1.y + src0.z * src1.z) / (src0.x * src0.x + src0.y * src0.y + src0.z * src0.z) * src0.z - src1.z
484 Considered for removal.
487 1.5.16 SEQ - Set On Equal
491 dst.x = (src0.x == src1.x) ? 1.0 : 0.0
492 dst.y = (src0.y == src1.y) ? 1.0 : 0.0
493 dst.z = (src0.z == src1.z) ? 1.0 : 0.0
494 dst.w = (src0.w == src1.w) ? 1.0 : 0.0
497 1.5.17 SFL - Set On False
506 Considered for removal.
508 1.5.18 SGT - Set On Greater Than
512 dst.x = (src0.x > src1.x) ? 1.0 : 0.0
513 dst.y = (src0.y > src1.y) ? 1.0 : 0.0
514 dst.z = (src0.z > src1.z) ? 1.0 : 0.0
515 dst.w = (src0.w > src1.w) ? 1.0 : 0.0
531 1.5.20 SLE - Set On Less Equal Than
535 dst.x = (src0.x <= src1.x) ? 1.0 : 0.0
536 dst.y = (src0.y <= src1.y) ? 1.0 : 0.0
537 dst.z = (src0.z <= src1.z) ? 1.0 : 0.0
538 dst.w = (src0.w <= src1.w) ? 1.0 : 0.0
541 1.5.21 SNE - Set On Not Equal
545 dst.x = (src0.x != src1.x) ? 1.0 : 0.0
546 dst.y = (src0.y != src1.y) ? 1.0 : 0.0
547 dst.z = (src0.z != src1.z) ? 1.0 : 0.0
548 dst.w = (src0.w != src1.w) ? 1.0 : 0.0
551 1.5.22 STR - Set On True
561 1.5.23 TEX - Texture Lookup
566 1.5.24 TXD - Texture Lookup with Derivatives
571 1.5.25 TXP - Projective Texture Lookup
576 1.5.26 UP2H - Unpack Two 16-Bit Floats
580 Considered for removal.
582 1.5.27 UP2US - Unpack Two Unsigned 16-Bit Scalars
586 Considered for removal.
588 1.5.28 UP4B - Unpack Four Signed 8-Bit Values
592 Considered for removal.
594 1.5.29 UP4UB - Unpack Four Unsigned 8-Bit Scalars
598 Considered for removal.
600 1.5.30 X2D - 2D Coordinate Transformation
604 dst.x = src0.x + src1.x * src2.x + src1.y * src2.y
605 dst.y = src0.y + src1.x * src2.z + src1.y * src2.w
606 dst.z = src0.x + src1.x * src2.x + src1.y * src2.y
607 dst.w = src0.y + src1.x * src2.z + src1.y * src2.w
609 Considered for removal.
612 1.6 GL_NV_vertex_program2
613 --------------------------
616 1.6.1 ARA - Address Register Add
620 Considered for removal.
622 1.6.2 ARR - Address Register Load With Round
636 Considered for removal.
638 1.6.4 CAL - Subroutine Call
644 1.6.5 RET - Subroutine Call Return
648 Potential restrictions:
649 * Only occurs at end of function.
655 dst.x = (src.x > 0.0) ? 1.0 : (src.x < 0.0) ? -1.0 : 0.0
656 dst.y = (src.y > 0.0) ? 1.0 : (src.y < 0.0) ? -1.0 : 0.0
657 dst.z = (src.z > 0.0) ? 1.0 : (src.z < 0.0) ? -1.0 : 0.0
658 dst.w = (src.w > 0.0) ? 1.0 : (src.w < 0.0) ? -1.0 : 0.0
665 dst.x = (src0.x < 0.0) ? src1.x : src2.x
666 dst.y = (src0.y < 0.0) ? src1.y : src2.y
667 dst.z = (src0.z < 0.0) ? src1.z : src2.z
668 dst.w = (src0.w < 0.0) ? src1.w : src2.w
671 1.8.2 KIL - Conditional Discard
675 if (src.x < 0.0 || src.y < 0.0 || src.z < 0.0 || src.w < 0.0)
693 1.8.4 TXB - Texture Lookup With Bias
698 1.9.1 NRM - 3-component Vector Normalise
702 dst.x = src.x / sqrt(src.x * src.x + src.y * src.y + src.z * src.z)
703 dst.y = src.y / sqrt(src.x * src.x + src.y * src.y + src.z * src.z)
704 dst.z = src.z / sqrt(src.x * src.x + src.y * src.y + src.z * src.z)
712 dst.x = src0.x / src1.x
713 dst.y = src0.y / src1.y
714 dst.z = src0.z / src1.z
715 dst.w = src0.w / src1.w
718 1.9.3 DP2 - 2-component Dot Product
722 dst.x = src0.x * src1.x + src0.y * src1.y
723 dst.y = src0.x * src1.x + src0.y * src1.y
724 dst.z = src0.x * src1.x + src0.y * src1.y
725 dst.w = src0.x * src1.x + src0.y * src1.y
728 1.9.5 TXL - Texture Lookup With LOD
743 1.9.8 BGNFOR - Begin a For-Loop
750 pc = [matching ENDFOR] + 1
753 Note: The destination must be a loop register.
754 The source must be a constant register.
756 Considered for cleanup / removal.
769 1.9.11 ENDIF - End If
774 1.9.12 ENDFOR - End a For-Loop
776 dst.x = dst.x + dst.z
780 pc = [matching BGNFOR instruction] + 1
783 Note: The destination must be a loop register.
785 Considered for cleanup / removal.
787 1.9.13 ENDREP - End Repeat
792 1.10.1 PUSHA - Push Address Register On Stack
799 Considered for cleanup / removal.
801 1.10.2 POPA - Pop Address Register From Stack
808 Considered for cleanup / removal.
811 1.11 GL_NV_gpu_program4
812 ------------------------
814 Support for these opcodes is indicated by a special pipe capability bit (TBD).
820 dst.x = \lceil src.x\rceil
822 dst.y = \lceil src.y\rceil
824 dst.z = \lceil src.z\rceil
826 dst.w = \lceil src.w\rceil
829 1.11.2 I2F - Integer To Float
833 dst.x = (float) src.x
834 dst.y = (float) src.y
835 dst.z = (float) src.z
836 dst.w = (float) src.w
839 1.11.3 NOT - Bitwise Not
849 1.11.4 TRUNC - Truncate
859 1.11.5 SHL - Shift Left
863 dst.x = src0.x << src1.x
864 dst.y = src0.y << src1.x
865 dst.z = src0.z << src1.x
866 dst.w = src0.w << src1.x
869 1.11.6 SHR - Shift Right
873 dst.x = src0.x >> src1.x
874 dst.y = src0.y >> src1.x
875 dst.z = src0.z >> src1.x
876 dst.w = src0.w >> src1.x
879 1.11.7 AND - Bitwise And
883 dst.x = src0.x & src1.x
884 dst.y = src0.y & src1.y
885 dst.z = src0.z & src1.z
886 dst.w = src0.w & src1.w
889 1.11.8 OR - Bitwise Or
893 dst.x = src0.x | src1.x
894 dst.y = src0.y | src1.y
895 dst.z = src0.z | src1.z
896 dst.w = src0.w | src1.w
903 dst.x = src0.x % src1.x
904 dst.y = src0.y % src1.y
905 dst.z = src0.z % src1.z
906 dst.w = src0.w % src1.w
909 1.11.10 XOR - Bitwise Xor
913 dst.x = src0.x ^ src1.x
914 dst.y = src0.y ^ src1.y
915 dst.z = src0.z ^ src1.z
916 dst.w = src0.w ^ src1.w
919 SAD - Sum Of Absolute Differences
923 dst.x = |src0.x - src1.x| + src2.x
925 dst.y = |src0.y - src1.y| + src2.y
927 dst.z = |src0.z - src1.z| + src2.z
929 dst.w = |src0.w - src1.w| + src2.w
932 1.11.12 TXF - Texel Fetch
937 1.11.13 TXQ - Texture Size Query
942 1.11.14 CONT - Continue
947 1.12 GL_NV_geometry_program4
948 -----------------------------
956 1.12.2 ENDPRIM - End Primitive
965 1.13.1 BGNLOOP - Begin a Loop
970 1.13.2 BGNSUB - Begin Subroutine
975 1.13.3 ENDLOOP - End a Loop
980 1.13.4 ENDSUB - End Subroutine
986 1.13.10 NOP - No Operation
992 1.16.7 NRM4 - 4-component Vector Normalise
996 dst.x = src.x / sqrt(src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
997 dst.y = src.y / sqrt(src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
998 dst.z = src.z / sqrt(src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
999 dst.w = src.w / sqrt(src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
1006 1.17.2 CALLNZ - Subroutine Call If Not Zero
1016 1.17.5 BREAKC - Break Conditional
1021 2 Explanation of symbols used
1022 ==============================
1029 :math:`|x|` Absolute value of `x`.
1031 :math:`\lceil x \rceil` Ceiling of `x`.
1033 clamp(x,y,z) Clamp x between y and z.
1034 (x < y) ? y : (x > z) ? z : x
1036 :math:`\lfloor x\rfloor` Floor of `x`.
1038 :math:`\log_2{x}` Logarithm of `x`, base 2.
1040 max(x,y) Maximum of x and y.
1043 min(x,y) Minimum of x and y.
1046 partialx(x) Derivative of x relative to fragment's X.
1048 partialy(x) Derivative of x relative to fragment's Y.
1050 pop() Pop from stack.
1052 :math:`x^y` `x` to the power `y`.
1054 push(x) Push x on stack.
1058 trunc(x) Truncate x.
1065 discard Discard fragment.
1067 dst First destination register.
1069 dst0 First destination register.
1073 src First source register.
1075 src0 First source register.
1077 src1 Second source register.
1079 src2 Third source register.
1081 target Label of target instruction.
1088 3.1 Declaration Semantic
1089 -------------------------
1092 Follows Declaration token if Semantic bit is set.
1094 Since its purpose is to link a shader with other stages of the pipeline,
1095 it may follow only those Declaration tokens that declare a register in
1096 either the INPUT or the OUTPUT file.
1098 SemanticName field contains the semantic name of the register being declared.
1099 There is no default value.
1101 SemanticIndex is an optional subscript that can be used to distinguish
1102 different register declarations with the same semantic name. The default value
1105 The meanings of the individual semantic names are explained in the following
1111 Valid only in a fragment shader INPUT declaration.
1113 FACE.x is negative when the primitive is back facing. FACE.x is positive
1114 when the primitive is front facing.