From 5811ed87d732101ab8cfbd087bc99d8c6c963f30 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Jos=C3=A9=20Fonseca?= <jfonseca@vmware.com>
Date: Sat, 22 Aug 2009 22:26:55 +0100
Subject: [PATCH] llvmpipe: Add a bunch of comments.

Description/rationale/to-do items, while I still remember them...
---
 src/gallium/drivers/llvmpipe/Makefile         |   2 +-
 src/gallium/drivers/llvmpipe/README           |  71 ++++++++----
 src/gallium/drivers/llvmpipe/SConscript       |   2 +-
 .../drivers/llvmpipe/lp_bld_blend_aos.c       |  11 +-
 ...p_bld_logicop.c => lp_bld_blend_logicop.c} |   8 ++
 .../drivers/llvmpipe/lp_bld_blend_soa.c       |  36 +++++-
 src/gallium/drivers/llvmpipe/lp_bld_conv.c    |  93 ++++++++++++----
 src/gallium/drivers/llvmpipe/lp_bld_depth.c   |  32 ++++++
 src/gallium/drivers/llvmpipe/lp_bld_depth.h   |   3 -
 src/gallium/drivers/llvmpipe/lp_bld_intr.c    |  19 ++--
 src/gallium/drivers/llvmpipe/lp_bld_intr.h    |   3 +
 src/gallium/drivers/llvmpipe/lp_bld_logic.c   |   7 ++
 src/gallium/drivers/llvmpipe/lp_bld_swizzle.c |   7 ++
 src/gallium/drivers/llvmpipe/lp_bld_swizzle.h |   2 +-
 src/gallium/drivers/llvmpipe/lp_bld_tgsi.h    |   7 ++
 .../drivers/llvmpipe/lp_bld_tgsi_soa.c        |  10 ++
 src/gallium/drivers/llvmpipe/lp_state_fs.c    | 103 ++++++++++++++++--
 17 files changed, 346 insertions(+), 70 deletions(-)
 rename src/gallium/drivers/llvmpipe/{lp_bld_logicop.c => lp_bld_blend_logicop.c} (96%)

diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
index c6c8754dada..102227f0f8b 100644
--- a/src/gallium/drivers/llvmpipe/Makefile
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -7,6 +7,7 @@ C_SOURCES = \
 	lp_bld_alpha.c \
 	lp_bld_arit.c \
 	lp_bld_blend_aos.c \
+	lp_bld_blend_logicop.c \
 	lp_bld_blend_soa.c \
 	lp_bld_const.c \
 	lp_bld_conv.c \
@@ -19,7 +20,6 @@ C_SOURCES = \
 	lp_bld_load.c \
 	lp_bld_store.c \
 	lp_bld_logic.c \
-	lp_bld_logicop.c \
 	lp_bld_swizzle.c \
 	lp_bld_tgsi_soa.c \
 	lp_bld_type.c \
diff --git a/src/gallium/drivers/llvmpipe/README b/src/gallium/drivers/llvmpipe/README
index 677352eaa1d..498d21dea6c 100644
--- a/src/gallium/drivers/llvmpipe/README
+++ b/src/gallium/drivers/llvmpipe/README
@@ -6,31 +6,40 @@ Status
 
 Done so far is:
 
-- TGSI -> LLVM fragment shader translation
-  - same level of support as the TGSI SSE2 exec machine
-  - texture sampling via an intrinsic call
-  - done in SoA
-  - input interpolation also code generated
-
-- blend -> LLVM (including logic ops)
-  - SoA and AoS, but only the former used
-
-- code is generic
-  - intermediates can be vectors of floats, ubytes, fixed point, etc, and of
-    any width and length
-  - not all operations are implemented for these types yet though
+ - the whole fragment pipeline is code generated in a single function
+ 
+   - depth testing
+ 
+   - fragment shader TGSI translation
+     - same level of support as the TGSI SSE2 exec machine, with the exception
+       we don't fall back to TGSI interpretation when an unsupported opcode is
+       found, but just ignore it
+     - texture sampling via an intrinsic call
+     - done in SoA layout
+     - input interpolation also code generated
+ 
+   - alpha testing
+ 
+   - blend (including logic ops)
+     - both in SoA and AoS layouts, but only the former used for now
+ 
+ - code is generic
+   - intermediates can be vectors of floats, ubytes, fixed point, etc, and of
+     any width and length
+   - not all operations are implemented for these types yet though
 
 Most mesa/progs/demos/* work. Speed is on par with Keith's softpipe-opt branch,
 which includes hand written fast implementations for common cases.
 
 To do (probably by this order):
-- code generate the rest of the fragment pipeline, namely the
-  depth/alpha/stencil state
-- concatenate the fragment pipeline (shader + depth/stencil/alpha + blend) in a
-  single function
-- code generate texture sampling
-- translate TGSI control flow instructions
-- code generate the triangle setup and rasterization
+
+ - code generate stipple and stencil testing
+
+ - code generate texture sampling
+
+ - translate TGSI control flow instructions, and all other remaining opcodes
+
+ - code generate the triangle setup and rasterization
 
 
 Requirements
@@ -70,7 +79,7 @@ Requirements
    instructions. This is necessary because we emit several SSE intrinsics for
    convenience. See /proc/cpuinfo to know what your CPU supports.
  
- - scons (although it should be straightforward to fix the Makefiles as well)
+ - scons
 
 
 Building
@@ -80,6 +89,12 @@ To build everything invoke scons as:
 
   scons debug=yes statetrackers=mesa drivers=llvmpipe winsys=xlib dri=false -k
 
+Alternatively, you can build it with GNU make, if you prefer, by invoking it as
+
+  make linux-llvm
+
+but the rest of these instructions assume scons is used.
+
 
 Using
 =====
@@ -87,9 +102,12 @@ Using
 Building will create a drop-in alternative for libGL.so. To use it set the
 environment variables:
 
-  export LD_LIBRARY_PATH=$PWD/build/linux-x86-debug/lib:$LD_LIBRARY_PATH
   export LD_LIBRARY_PATH=$PWD/build/linux-x86_64-debug/lib:$LD_LIBRARY_PATH
 
+or
+
+  export LD_LIBRARY_PATH=$PWD/build/linux-x86-debug/lib:$LD_LIBRARY_PATH
+
 
 Unit testing
 ============
@@ -104,12 +122,19 @@ build/linux-???-debug/gallium/drivers/llvmpipe:
 Some of this tests can output results and benchmarks to a tab-seperated-file
 for posterior analysis, e.g.:
 
-  build/linux-x86_64/gallium/drivers/llvmpipe/lp_test_blend -o blend.tsv
+  build/linux-x86_64-debug/gallium/drivers/llvmpipe/lp_test_blend -o blend.tsv
 
 
 Development Notes
 =================
 
+- When looking at this code for the first time, start in lp_state_fs.c, and
+  then skim through the lp_bld_* functions called in there, and the comments
+  at the top of the lp_bld_*.c files.
+
+- All lp_bld_*.[ch] are isolated from the rest of the driver, and could/may be 
+  put in a standalone Gallium state -> LLVM IR translation module.
+
 - We use LLVM-C bindings for now. They are not documented, but follow the C++
   interfaces very closely, and appear to be complete enough for code
   generation. See 
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index a9501b47655..84fd8fe95ab 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -11,6 +11,7 @@ llvmpipe = env.ConvenienceLibrary(
 		'lp_bld_alpha.c',
 		'lp_bld_arit.c',
 		'lp_bld_blend_aos.c',
+		'lp_bld_blend_logicop.c',
 		'lp_bld_blend_soa.c',
 		'lp_bld_const.c',
 		'lp_bld_conv.c',
@@ -23,7 +24,6 @@ llvmpipe = env.ConvenienceLibrary(
 		'lp_bld_load.c',
 		'lp_bld_store.c',
 		'lp_bld_logic.c',
-		'lp_bld_logicop.c',
 		'lp_bld_swizzle.c',
 		'lp_bld_tgsi_soa.c',		
 		'lp_bld_type.c',
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
index 87ba4560653..c11a9398f87 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
@@ -28,7 +28,16 @@
 
 /**
  * @file
- * Blend LLVM IR generation -- AOS form.
+ * Blend LLVM IR generation -- AoS layout.
+ *
+ * AoS blending is in general much slower than SoA, but there are some cases
+ * where it might be faster. In particular, if a pixel is rendered only once
+ * then the overhead of tiling and untiling will dominate over the speedup that
+ * SoA gives. So we might want to detect such cases and fall back to AoS in the
+ * future, but for now this code is here for historical/benchmarking
+ * purposes.
+ *
+ * Run lp_test_blend after any change to this file.
  *
  * @author Jose Fonseca <jfonseca@vmware.com>
  */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_logicop.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_logicop.c
similarity index 96%
rename from src/gallium/drivers/llvmpipe/lp_bld_logicop.c
rename to src/gallium/drivers/llvmpipe/lp_bld_blend_logicop.c
index f9202d1a838..88321f62a2c 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_logicop.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_logicop.c
@@ -26,6 +26,14 @@
  **************************************************************************/
 
 
+/**
+ * @file
+ * Blend LLVM IR generation -- logic ops.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
 #include "pipe/p_state.h"
 
 #include "lp_bld_blend.h"
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
index 73516fd81b4..b92254a7d6f 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
@@ -28,7 +28,41 @@
 
 /**
  * @file
- * Blend LLVM IR generation -- SoA.
+ * Blend LLVM IR generation -- SoA layout.
+ *
+ * Blending in SoA is much faster than AoS, especially when separate rgb/alpha
+ * factors/functions are used, since no channel masking/shuffling is necessary
+ * and we can achieve the full throughput of the SIMD operations. Furthermore
+ * the fragment shader output is also in SoA, so it fits nicely with the rest of
+ * the fragment pipeline.
+ *
+ * The drawback is that to be displayed the color buffer needs to be in AoS
+ * layout, so we need to tile/untile the color buffer before/after rendering.
+ * A color buffer like
+ *
+ *  R11 G11 B11 A11 R12 G12 B12 A12  R13 G13 B13 A13 R14 G14 B14 A14  ...
+ *  R21 G21 B21 A21 R22 G22 B22 A22  R23 G23 B23 A23 R24 G24 B24 A24  ...
+ *
+ *  R31 G31 B31 A31 R32 G32 B32 A32  R33 G33 B33 A33 R34 G34 B34 A34  ...
+ *  R41 G41 B41 A41 R42 G42 B42 A42  R43 G43 B43 A43 R44 G44 B44 A44  ...
+ *
+ *  ... ... ... ... ... ... ... ...  ... ... ... ... ... ... ... ...  ...
+ *
+ * will actually be stored in memory as
+ *
+ *  R11 R12 R21 R22 R13 R14 R23 R24 ... G11 G12 G21 G22 G13 G14 G23 G24 ... B11 B12 B21 B22 B13 B14 B23 B24 ... A11 A12 A21 A22 A13 A14 A23 A24 ...
+ *  R31 R32 R41 R42 R33 R34 R43 R44 ... G31 G32 G41 G42 G33 G34 G43 G44 ... B31 B32 B41 B42 B33 B34 B43 B44 ... A31 A32 A41 A42 A33 A34 A43 A44 ...
+ *  ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
+ *
+ * NOTE: Run lp_test_blend after any change to this file.
+ *
+ * You can also run lp_test_blend to obtain AoS vs SoA benchmarks. Invoking it
+ * as:
+ *
+ *  lp_test_blend -o blend.tsv
+ *
+ * will generate a tab-separated file with the test results and performance
+ * measurements.
  *
  * @author Jose Fonseca <jfonseca@vmware.com>
  */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_conv.c b/src/gallium/drivers/llvmpipe/lp_bld_conv.c
index 54d2e13d34a..3a54272cbd3 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_conv.c
@@ -28,18 +28,34 @@
 
 /**
  * @file
- * Helper
+ * Helper functions for type conversions.
  *
- * LLVM IR doesn't support all basic arithmetic operations we care about (most
- * notably min/max and saturated operations), and it is often necessary to
- * resort machine-specific intrinsics directly. The functions here hide all
- * these implementation details from the other modules.
+ * We want to use the fastest type for a given computation whenever feasible.
+ * The other side of this is that we need to be able to convert between several
+ * types accurately and efficiently.
  *
- * We also do simple expressions simplification here. Reasons are:
- * - it is very easy given we have all necessary information readily available
- * - LLVM optimization passes fail to simplify several vector expressions
- * - We often know value constraints which the optimization passes have no way
- *   of knowing, such as when source arguments are known to be in [0, 1] range.
+ * Conversion between bit widths is complex, as the vector length changes too.
+ *
+ * To remember there are a few invariants in type conversions:
+ *
+ * - register width must remain constant:
+ *
+ *     src_type.width * src_type.length == dst_type.width * dst_type.length
+ *
+ * - total number of elements must remain constant:
+ *
+ *     src_type.length * num_srcs == dst_type.length * num_dsts
+ *
+ * It is not always possible to do the conversion both accurately and
+ * efficiently, usually due to lack of adequate machine instructions. In these
+ * cases it is important not to take shortcuts here and sacrifice accuracy, as
+ * these functions can be used anywhere. In the future we might have a
+ * precision parameter which can gauge the accuracy vs efficiency compromise,
+ * but for now if the data conversion between two stages happens to be the
+ * bottleneck, then most likely we should just avoid converting at all and run
+ * both stages with the same type.
+ *
+ * Make sure to run lp_test_conv unit test after any change to this file.
  *
  * @author Jose Fonseca <jfonseca@vmware.com>
  */
@@ -55,6 +71,19 @@
 #include "lp_bld_conv.h"
 
 
+/**
+ * Special case for converting clamped IEEE-754 floats to unsigned norms.
+ *
+ * The mathematical voodoo below may seem excessive but it is actually
+ * paramount we do it this way for several reasons. First, there is no single
+ * precision FP to unsigned integer conversion Intel SSE instruction. Second,
+ * even if there was, since the FP's mantissa takes only a fraction of the
+ * register bits the typical scale and cast approach would require double
+ * precision for accurate results, and therefore half the throughput.
+ *
+ * Although the result values can be scaled to an arbitrary bit width specified
+ * by dst_width, the actual result type will have the same width.
+ */
 LLVMValueRef
 lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
                                         union lp_type src_type,
@@ -118,7 +147,7 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
 
 
 /**
- * Inverse of lp_build_clamped_float_to_unsigned_norm.
+ * Inverse of lp_build_clamped_float_to_unsigned_norm above.
  */
 LLVMValueRef
 lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
@@ -139,7 +168,6 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
 
    mantissa = lp_mantissa(dst_type);
 
-   /* We cannot carry more bits than the mantissa */
    n = MIN2(mantissa, src_width);
 
    ubound = ((unsigned long long)1 << n);
@@ -212,6 +240,12 @@ lp_build_const_pack_shuffle(unsigned n)
 }
 
 
+/**
+ * Expand the bit width.
+ *
+ * This will only change the number of bits the values are represented with,
+ * not the values themselves.
+ */
 static void
 lp_build_expand(LLVMBuilderRef builder,
                union lp_type src_type,
@@ -270,9 +304,13 @@ lp_build_expand(LLVMBuilderRef builder,
 /**
  * Non-interleaved pack.
  *
- * lo =   __ l0 __ l1 __ l2 __..  __ ln
- * hi  =  __ h0 __ h1 __ h2 __..  __ hn
- * res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
+ * This will move values as
+ *
+ *   lo =   __ l0 __ l1 __ l2 __..  __ ln
+ *   hi =   __ h0 __ h1 __ h2 __..  __ hn
+ *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
+ *
+ * TODO: handle saturation consistently.
  */
 static LLVMValueRef
 lp_build_pack2(LLVMBuilderRef builder,
@@ -347,6 +385,11 @@ lp_build_pack2(LLVMBuilderRef builder,
 }
 
 
+/**
+ * Truncate the bit width.
+ *
+ * TODO: Handle saturation consistently.
+ */
 static LLVMValueRef
 lp_build_trunc(LLVMBuilderRef builder,
                union lp_type src_type,
@@ -392,13 +435,10 @@ lp_build_trunc(LLVMBuilderRef builder,
 
 
 /**
- * Convert between two SIMD types.
+ * Generic type conversion.
  *
- * Converting between SIMD types of different element width poses a problem:
- * SIMD registers have a fixed number of bits, so different element widths
- * imply different vector lengths. Therefore we must multiplex the multiple
- * incoming sources into a single destination vector, or demux a single incoming
- * vector into multiple vectors.
+ * TODO: Take a precision argument, or even better, add a new precision member
+ * to the lp_type union.
  */
 void
 lp_build_conv(LLVMBuilderRef builder,
@@ -605,7 +645,14 @@ lp_build_conv(LLVMBuilderRef builder,
 
 
 /**
- * Convenience wrapper around lp_build_conv for bit masks.
+ * Bit mask conversion.
+ *
+ * This will convert the integer masks that match the given types.
+ *
+ * The mask values should be 0 or -1, i.e., all bits either set to zero or one.
+ * Any other value will likely cause unpredictable results.
+ *
+ * This is basically a very trimmed down version of lp_build_conv.
  */
 void
 lp_build_conv_mask(LLVMBuilderRef builder,
@@ -621,6 +668,8 @@ lp_build_conv_mask(LLVMBuilderRef builder,
    assert(src_type.length * num_srcs == dst_type.length * num_dsts);
 
    /*
+    * Drop
+    *
     * We assume all values are 0 or -1
     */
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 118c7c52137..6018feda1c6 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -26,8 +26,34 @@
  **************************************************************************/
 
 /**
+ * @file
  * Depth/stencil testing to LLVM IR translation.
  *
+ * To be done accurately/efficiently the depth/stencil test must be done with
+ * the same type/format of the depth/stencil buffer, which implies massaging
+ * the incoming depths to fit into place. Using a more straightforward
+ * type/format for depth/stencil values internally and only convert when
+ * flushing would avoid this, but it would most likely result in depth fighting
+ * artifacts.
+ *
+ * We are free to use a different pixel layout though. Since our basic
+ * processing unit is a quad (2x2 pixel block) we store the depth/stencil
+ * values tiled, a quad at a time. That is, a depth buffer containing
+ *
+ *  Z11 Z12 Z13 Z14 ...
+ *  Z21 Z22 Z23 Z24 ...
+ *  Z31 Z32 Z33 Z34 ...
+ *  Z41 Z42 Z43 Z44 ...
+ *  ... ... ... ... ...
+ *
+ * will actually be stored in memory as
+ *
+ *  Z11 Z12 Z21 Z22 Z13 Z14 Z23 Z24 ...
+ *  Z31 Z32 Z41 Z42 Z33 Z34 Z43 Z44 ...
+ *  ... ... ... ... ... ... ... ... ...
+ *
+ * FIXME: Code generate stencil test
+ *
  * @author Jose Fonseca <jfonseca@vmware.com>
  */
 
@@ -42,6 +68,9 @@
 #include "lp_bld_depth.h"
 
 
+/**
+ * Return a type appropriate for depth/stencil testing.
+ */
 union lp_type
 lp_depth_type(const struct util_format_description *format_desc,
               unsigned length)
@@ -79,6 +108,9 @@ lp_depth_type(const struct util_format_description *format_desc,
 }
 
 
+/**
+ * Depth test.
+ */
 void
 lp_build_depth_test(LLVMBuilderRef builder,
                     const struct pipe_depth_state *state,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.h b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
index a5de698ebbf..5d2e042fcc5 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
@@ -45,9 +45,6 @@ union lp_type;
 struct lp_build_mask_context;
 
 
-/**
- * Return a type appropriate for depth testing.
- */
 union lp_type
 lp_depth_type(const struct util_format_description *format_desc,
               unsigned length);
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_intr.c b/src/gallium/drivers/llvmpipe/lp_bld_intr.c
index 4f03ce7d0a9..42fd57fdf05 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_intr.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_intr.c
@@ -28,18 +28,17 @@
 
 /**
  * @file
- * Helper
+ * Helpers for emitting intrinsic calls.
  *
- * LLVM IR doesn't support all basic arithmetic operations we care about (most
- * notably min/max and saturated operations), and it is often necessary to
- * resort machine-specific intrinsics directly. The functions here hide all
- * these implementation details from the other modules.
+ * LLVM vanilla IR doesn't represent all basic arithmetic operations we care
+ * about, and it is often necessary to resort to target-specific intrinsics for
+ * performance or convenience.
  *
- * We also do simple expressions simplification here. Reasons are:
- * - it is very easy given we have all necessary information readily available
- * - LLVM optimization passes fail to simplify several vector expressions
- * - We often know value constraints which the optimization passes have no way
- *   of knowing, such as when source arguments are known to be in [0, 1] range.
+ * Ideally we would like to stay away from target specific intrinsics and
+ * move all the instruction selection logic into upstream LLVM where it belongs.
+ *
+ * These functions are also used for calling C functions provided by us from
+ * generated LLVM code.
  *
  * @author Jose Fonseca <jfonseca@vmware.com>
  */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_intr.h b/src/gallium/drivers/llvmpipe/lp_bld_intr.h
index 1e8e0edd831..3608988dc47 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_intr.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_intr.h
@@ -40,6 +40,9 @@
 #include <llvm-c/Core.h>  
 
 
+/**
+ * Max number of arguments in an intrinsic.
+ */
 #define LP_MAX_FUNC_ARGS 32
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_logic.c b/src/gallium/drivers/llvmpipe/lp_bld_logic.c
index 5b8efb05778..b99fa89be33 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_logic.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_logic.c
@@ -25,6 +25,13 @@
  *
  **************************************************************************/
 
+/**
+ * @file
+ * Helper functions for logical operations.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
 
 #include "pipe/p_defines.h"
 #include "lp_bld_type.h"
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c
index 5204a851d61..27ca9b0edca 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c
@@ -25,6 +25,13 @@
  *
  **************************************************************************/
 
+/**
+ * @file
+ * Helper functions for swizzling/shuffling.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
 
 #include "util/u_debug.h"
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h
index 7a4aa883824..d7dd6a8a604 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h
@@ -27,7 +27,7 @@
 
 /**
  * @file
- * Helper functions for constant building.
+ * Helper functions for swizzling/shuffling.
  *
  * @author Jose Fonseca <jfonseca@vmware.com>
  */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h b/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h
index e77cf26de35..8aaf494d2b2 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h
@@ -25,6 +25,13 @@
  *
  **************************************************************************/
 
+/**
+ * @file
+ * TGSI to LLVM IR translation.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
 #ifndef LP_BLD_TGSI_H
 #define LP_BLD_TGSI_H
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
index d35c8c6b7b9..a37776aa7f8 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
@@ -26,6 +26,16 @@
  * 
  **************************************************************************/
 
+/**
+ * @file
+ * TGSI to LLVM IR translation -- SoA.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ *
+ * Based on tgsi_sse2.c code written by Michal Krol, Keith Whitwell,
+ * Brian Paul, and others.
+ */
+
 #include "pipe/p_config.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_debug.h"
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 5bd0d7b9827..a9b2d482446 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -26,6 +26,38 @@
  * 
  **************************************************************************/
 
+/**
+ * @file
+ * Code generate the whole fragment pipeline.
+ *
+ * The fragment pipeline consists of the following stages:
+ * - stipple (TBI)
+ * - early depth test
+ * - fragment shader
+ * - alpha test
+ * - depth/stencil test (stencil TBI)
+ * - blending
+ *
+ * This file has only the glue to assemble the fragment pipeline.  The actual
+ * plumbing of converting Gallium state into LLVM IR is done elsewhere, in the
+ * lp_bld_*.[ch] files, and in a completely generic and reusable way. Here we
+ * muster the LLVM JIT execution engine to create a function that follows an
+ * established binary interface and that can be called from C directly.
+ *
+ * A big source of complexity here is that we often want to run different
+ * stages with different precisions and data types. For example, the fragment
+ * shader typically needs to be done in floats, but the depth/stencil test and
+ * blending are better done in the type that most closely matches the
+ * depth/stencil and color buffer respectively.
+ *
+ * Since the width of a SIMD vector register stays the same regardless of the
+ * element type, different types imply different number of elements, so we must
+ * code generate more instances of the stages with larger types to be able to
+ * feed/consume the stages with smaller types.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
 #include "pipe/p_defines.h"
 #include "util/u_memory.h"
 #include "util/u_format.h"
@@ -56,6 +88,14 @@ static const unsigned char quad_offset_x[4] = {0, 1, 0, 1};
 static const unsigned char quad_offset_y[4] = {0, 0, 1, 1};
 
 
+/**
+ * Generate the position vectors.
+ *
+ * TODO: This should be called only once per fragment pipeline, for the first
+ * quad, and the neighboring quad positions obtained by additions.
+ *
+ * Parameter x, y are the integer values with the quad upper left coordinates.
+ */
 static void
 generate_pos(LLVMBuilderRef builder,
              LLVMValueRef x,
@@ -74,6 +114,11 @@ generate_pos(LLVMBuilderRef builder,
    unsigned chan;
    unsigned i;
 
+   /*
+    * Derive from the quad's upper left scalar coordinates the coordinates for
+    * all other quad pixels
+    */
+
    x = lp_build_broadcast(builder, int_vec_type, x);
    y = lp_build_broadcast(builder, int_vec_type, y);
 
@@ -91,6 +136,10 @@ generate_pos(LLVMBuilderRef builder,
    pos[0] = x;
    pos[1] = y;
 
+   /* 
+    * Calculate z and w from the interpolation factors.
+    */
+
    for(chan = 2; chan < NUM_CHANNELS; ++chan) {
       LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), chan, 0);
       LLVMValueRef a0   = LLVMBuildLoad(builder, LLVMBuildGEP(builder, a0_ptr,   &index, 1, ""), "");
@@ -111,6 +160,9 @@ generate_pos(LLVMBuilderRef builder,
 }
 
 
+/**
+ * Generate the depth test.
+ */
 static void
 generate_depth(struct llvmpipe_context *lp,
                LLVMBuilderRef builder,
@@ -129,8 +181,10 @@ generate_depth(struct llvmpipe_context *lp,
    format_desc = util_format_description(lp->framebuffer.zsbuf->format);
    assert(format_desc);
 
+   /* Pick the depth type. */
    dst_type = lp_depth_type(format_desc, src_type.width*src_type.length);
 
+   /* FIXME: Cope with a depth test type with a different bit width. */
    assert(dst_type.width == src_type.width);
    assert(dst_type.length == src_type.length);
 
@@ -154,7 +208,7 @@ generate_depth(struct llvmpipe_context *lp,
 
 
 /**
- * Generate the fragment shader, depth/stencil and alpha tests.
+ * Generate the fragment shader, depth/stencil test, and alpha test.
  */
 static void
 generate_fs(struct llvmpipe_context *lp,
@@ -258,10 +312,7 @@ generate_fs(struct llvmpipe_context *lp,
 
 
 /**
- * Generate blending code according to blend->base state.
- * The blend function will look like:
- *    blend(mask, src_color, constant color, dst_color)
- * dst_color will be modified and contain the result of the blend func.
+ * Generate color blending and color output.
  */
 static void
 generate_blend(const struct pipe_blend_state *blend,
@@ -310,6 +361,9 @@ generate_blend(const struct pipe_blend_state *blend,
 }
 
 
+/**
+ * Generate the runtime callable function for the whole fragment pipeline.
+ */
 static struct lp_fragment_shader_variant *
 generate_fragment(struct llvmpipe_context *lp,
                   struct lp_fragment_shader *shader,
@@ -379,6 +433,9 @@ generate_fragment(struct llvmpipe_context *lp,
    variant->shader = shader;
    memcpy(&variant->key, key, sizeof *key);
 
+   /* TODO: actually pick these based on the fs and color buffer
+    * characteristics. */
+
    fs_type.value = 0;
    fs_type.floating = TRUE; /* floating point values */
    fs_type.sign = TRUE;     /* values are signed */
@@ -394,6 +451,11 @@ generate_fragment(struct llvmpipe_context *lp,
    blend_type.width = 8;        /* 8-bit ubyte values */
    blend_type.length = 16;      /* 16 elements per vector */
 
+   /* 
+    * Generate the function prototype. Any change here must be reflected in
+    * lp_state.h's lp_shader_fs_func function pointer type, and vice-versa.
+    */
+
    fs_elem_type = lp_build_elem_type(fs_type);
    fs_vec_type = lp_build_vec_type(fs_type);
    fs_int_vec_type = lp_build_int_vec_type(fs_type);
@@ -442,6 +504,10 @@ generate_fragment(struct llvmpipe_context *lp,
    lp_build_name(depth_ptr, "depth");
    lp_build_name(samplers_ptr, "samplers");
 
+   /*
+    * Function body
+    */
+
    block = LLVMAppendBasicBlock(variant->function, "entry");
    builder = LLVMCreateBuilder();
    LLVMPositionBuilderAtEnd(builder, block);
@@ -479,6 +545,10 @@ generate_fragment(struct llvmpipe_context *lp,
          fs_out_color[chan][i] = out_color[chan];
    }
 
+   /* 
+    * Convert the fs's output color and mask to fit to the blending type. 
+    */
+
    for(chan = 0; chan < NUM_CHANNELS; ++chan) {
       lp_build_conv(builder, fs_type, blend_type,
                     fs_out_color[chan], num_fs,
@@ -490,6 +560,10 @@ generate_fragment(struct llvmpipe_context *lp,
                                fs_mask, num_fs,
                                &blend_mask, 1);
 
+   /*
+    * Blending.
+    */
+
    generate_blend(&key->blend,
                   builder,
                   blend_type,
@@ -498,10 +572,14 @@ generate_fragment(struct llvmpipe_context *lp,
                   NULL /* FIXME: blend_const_color */,
                   color_ptr);
 
-   LLVMBuildRetVoid(builder);;
+   LLVMBuildRetVoid(builder);
 
    LLVMDisposeBuilder(builder);
 
+   /*
+    * Translate the LLVM IR into machine code.
+    */
+
    LLVMRunFunctionPassManager(screen->pass, variant->function);
 
 #ifdef DEBUG
@@ -514,6 +592,9 @@ generate_fragment(struct llvmpipe_context *lp,
       abort();
    }
 
+   /* Tell where the fetch_texel function is, if the shader refers to it.
+    * TODO: this should be done elsewhere.
+    */
    fetch_texel = LLVMGetNamedFunction(screen->module, "fetch_texel");
    if(fetch_texel) {
       static boolean first_time = TRUE;
@@ -616,12 +697,20 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe,
 }
 
 
-void llvmpipe_update_fs(struct llvmpipe_context *lp)
+void 
+llvmpipe_update_fs(struct llvmpipe_context *lp)
 {
    struct lp_fragment_shader *shader = lp->fs;
    struct lp_fragment_shader_variant_key key;
    struct lp_fragment_shader_variant *variant;
 
+   /* We need to generate several variants of the fragment pipeline to match
+    * all the combinations of the contributing state atoms.
+    *
+    * TODO: there is actually no reason to tie this to context state -- the
+    * generated code could be cached globally in the screen.
+    */
+
    memset(&key, 0, sizeof key);
    memcpy(&key.depth, &lp->depth_stencil->depth, sizeof &key.depth);
    memcpy(&key.alpha, &lp->depth_stencil->alpha, sizeof &key.alpha);
-- 
2.30.2