Description/rationale/to-do items, while I still remember them...
lp_bld_alpha.c \
lp_bld_arit.c \
lp_bld_blend_aos.c \
+ lp_bld_blend_logicop.c \
lp_bld_blend_soa.c \
lp_bld_const.c \
lp_bld_conv.c \
lp_bld_load.c \
lp_bld_store.c \
lp_bld_logic.c \
- lp_bld_logicop.c \
lp_bld_swizzle.c \
lp_bld_tgsi_soa.c \
lp_bld_type.c \
Done so far is:
-- TGSI -> LLVM fragment shader translation
- - same level of support as the TGSI SSE2 exec machine
- - texture sampling via an intrinsic call
- - done in SoA
- - input interpolation also code generated
-
-- blend -> LLVM (including logic ops)
- - SoA and AoS, but only the former used
-
-- code is generic
- - intermediates can be vectors of floats, ubytes, fixed point, etc, and of
- any width and length
- - not all operations are implemented for these types yet though
+ - the whole fragment pipeline is code generated in a single function
+
+ - depth testing
+
+ - fragment shader TGSI translation
+   - same level of support as the TGSI SSE2 exec machine, with the exception
+     that we don't fall back to TGSI interpretation when an unsupported opcode
+     is found, but just ignore it
+ - texture sampling via an intrinsic call
+ - done in SoA layout
+ - input interpolation also code generated
+
+ - alpha testing
+
+ - blend (including logic ops)
+ - both in SoA and AoS layouts, but only the former used for now
+
+ - code is generic
+ - intermediates can be vectors of floats, ubytes, fixed point, etc, and of
+ any width and length
+ - not all operations are implemented for these types yet though
Most mesa/progs/demos/* work. Speed is on par with Keith's softpipe-opt branch,
which includes hand written fast implementations for common cases.
To do (probably by this order):
-- code generate the rest of the fragment pipeline, namely the
- depth/alpha/stencil state
-- concatenate the fragment pipeline (shader + depth/stencil/alpha + blend) in a
- single function
-- code generate texture sampling
-- translate TGSI control flow instructions
-- code generate the triangle setup and rasterization
+
+ - code generate stipple and stencil testing
+
+ - code generate texture sampling
+
+ - translate TGSI control flow instructions, and all other remaining opcodes
+
+ - code generate the triangle setup and rasterization
Requirements
instructions. This is necessary because we emit several SSE intrinsics for
convenience. See /proc/cpuinfo to know what your CPU supports.
- - scons (although it should be straightforward to fix the Makefiles as well)
+ - scons
Building
scons debug=yes statetrackers=mesa drivers=llvmpipe winsys=xlib dri=false -k
+Alternatively, you can build it with GNU make, if you prefer, by invoking it as
+
+ make linux-llvm
+
+but the rest of these instructions assume scons is used.
+
Using
=====
Building will create a drop-in alternative for libGL.so. To use it set the
environment variables:
- export LD_LIBRARY_PATH=$PWD/build/linux-x86-debug/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$PWD/build/linux-x86_64-debug/lib:$LD_LIBRARY_PATH
+or
+
+ export LD_LIBRARY_PATH=$PWD/build/linux-x86-debug/lib:$LD_LIBRARY_PATH
+
Unit testing
============
Some of this tests can output results and benchmarks to a tab-seperated-file
for posterior analysis, e.g.:
- build/linux-x86_64/gallium/drivers/llvmpipe/lp_test_blend -o blend.tsv
+ build/linux-x86_64-debug/gallium/drivers/llvmpipe/lp_test_blend -o blend.tsv
Development Notes
=================
+- When looking at this code for the first time, start in lp_state_fs.c, and
+  then skim through the lp_bld_* functions called from there, and the comments
+  at the top of the lp_bld_*.c files.
+
+- All lp_bld_*.[ch] are isolated from the rest of the driver, and could/may be
+ put in a standalone Gallium state -> LLVM IR translation module.
+
- We use LLVM-C bindings for now. They are not documented, but follow the C++
interfaces very closely, and appear to be complete enough for code
generation. See
'lp_bld_alpha.c',
'lp_bld_arit.c',
'lp_bld_blend_aos.c',
+ 'lp_bld_blend_logicop.c',
'lp_bld_blend_soa.c',
'lp_bld_const.c',
'lp_bld_conv.c',
'lp_bld_load.c',
'lp_bld_store.c',
'lp_bld_logic.c',
- 'lp_bld_logicop.c',
'lp_bld_swizzle.c',
'lp_bld_tgsi_soa.c',
'lp_bld_type.c',
/**
* @file
- * Blend LLVM IR generation -- AOS form.
+ * Blend LLVM IR generation -- AoS layout.
+ *
+ * AoS blending is in general much slower than SoA, but there are some cases
+ * where it might be faster. In particular, if a pixel is rendered only once
+ * then the overhead of tiling and untiling will dominate over the speedup that
+ * SoA gives. So we might want to detect such cases and fallback to AoS in the
+ * future, but for now this function is here for historical/benchmarking
+ * purposes.
+ *
+ * Run lp_blend_test after any change to this file.
*
* @author Jose Fonseca <jfonseca@vmware.com>
*/
--- /dev/null
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Blend LLVM IR generation -- logic ops.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "pipe/p_state.h"
+
+#include "lp_bld_blend.h"
+
+
+LLVMValueRef
+lp_build_logicop(LLVMBuilderRef builder,
+ unsigned logicop_func,
+ LLVMValueRef src,
+ LLVMValueRef dst)
+{
+ LLVMTypeRef type;
+ LLVMValueRef res;
+
+ type = LLVMTypeOf(src);
+
+ switch (logicop_func) {
+ case PIPE_LOGICOP_CLEAR:
+ res = LLVMConstNull(type);
+ break;
+ case PIPE_LOGICOP_NOR:
+ res = LLVMBuildNot(builder, LLVMBuildOr(builder, src, dst, ""), "");
+ break;
+ case PIPE_LOGICOP_AND_INVERTED:
+ res = LLVMBuildAnd(builder, LLVMBuildNot(builder, src, ""), dst, "");
+ break;
+ case PIPE_LOGICOP_COPY_INVERTED:
+ res = LLVMBuildNot(builder, src, "");
+ break;
+ case PIPE_LOGICOP_AND_REVERSE:
+ res = LLVMBuildAnd(builder, src, LLVMBuildNot(builder, dst, ""), "");
+ break;
+ case PIPE_LOGICOP_INVERT:
+ res = LLVMBuildNot(builder, dst, "");
+ break;
+ case PIPE_LOGICOP_XOR:
+ res = LLVMBuildXor(builder, src, dst, "");
+ break;
+ case PIPE_LOGICOP_NAND:
+ res = LLVMBuildNot(builder, LLVMBuildAnd(builder, src, dst, ""), "");
+ break;
+ case PIPE_LOGICOP_AND:
+ res = LLVMBuildAnd(builder, src, dst, "");
+ break;
+ case PIPE_LOGICOP_EQUIV:
+ res = LLVMBuildNot(builder, LLVMBuildXor(builder, src, dst, ""), "");
+ break;
+ case PIPE_LOGICOP_NOOP:
+ res = dst;
+ break;
+ case PIPE_LOGICOP_OR_INVERTED:
+ res = LLVMBuildOr(builder, LLVMBuildNot(builder, src, ""), dst, "");
+ break;
+ case PIPE_LOGICOP_COPY:
+ res = src;
+ break;
+ case PIPE_LOGICOP_OR_REVERSE:
+ res = LLVMBuildOr(builder, src, LLVMBuildNot(builder, dst, ""), "");
+ break;
+ case PIPE_LOGICOP_OR:
+ res = LLVMBuildOr(builder, src, dst, "");
+ break;
+ case PIPE_LOGICOP_SET:
+ res = LLVMConstAllOnes(type);
+ break;
+ default:
+ assert(0);
+ res = src;
+ }
+
+ return res;
+}
/**
* @file
- * Blend LLVM IR generation -- SoA.
+ * Blend LLVM IR generation -- SoA layout.
+ *
+ * Blending in SoA is much faster than AoS, especially when separate rgb/alpha
+ * factors/functions are used, since no channel masking/shuffling is necessary
+ * and we can achieve the full throughput of the SIMD operations. Furthermore
+ * the fragment shader output is also in SoA, so it fits nicely with the rest of
+ * the fragment pipeline.
+ *
+ * The drawback is that to be displayed the color buffer needs to be in AoS
+ * layout, so we need to tile/untile the color buffer before/after rendering.
+ * A color buffer like
+ *
+ * R11 G11 B11 A11 R12 G12 B12 A12 R13 G13 B13 A13 R14 G14 B14 A14 ...
+ * R21 G21 B21 A21 R22 G22 B22 A22 R23 G23 B23 A23 R24 G24 B24 A24 ...
+ *
+ * R31 G31 B31 A31 R32 G32 B32 A32 R33 G33 B33 A33 R34 G34 B34 A34 ...
+ * R41 G41 B41 A41 R42 G42 B42 A42 R43 G43 B43 A43 R44 G44 B44 A44 ...
+ *
+ * ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
+ *
+ * will actually be stored in memory as
+ *
+ * R11 R12 R21 R22 R13 R14 R23 R24 ... G11 G12 G21 G22 G13 G14 G23 G24 ... B11 B12 B21 B22 B13 B14 B23 B24 ... A11 A12 A21 A22 A13 A14 A23 A24 ...
+ * R31 R32 R41 R42 R33 R34 R43 R44 ... G31 G32 G41 G42 G33 G34 G43 G44 ... B31 B32 B41 B42 B33 B34 B43 B44 ... A31 A32 A41 A42 A33 A34 A43 A44 ...
+ * ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
+ *
+ * NOTE: Run lp_blend_test after any change to this file.
+ *
+ * You can also run lp_blend_test to obtain AoS vs SoA benchmarks. Invoking it
+ * as:
+ *
+ * lp_blend_test -o blend.tsv
+ *
+ * will generate a tab-separated file with the test results and performance
+ * measurements.
*
* @author Jose Fonseca <jfonseca@vmware.com>
*/
/**
* @file
- * Helper
+ * Helper functions for type conversions.
*
- * LLVM IR doesn't support all basic arithmetic operations we care about (most
- * notably min/max and saturated operations), and it is often necessary to
- * resort machine-specific intrinsics directly. The functions here hide all
- * these implementation details from the other modules.
+ * We want to use the fastest type for a given computation whenever feasible.
+ * The other side of this is that we need to be able to convert between
+ * several types accurately and efficiently.
*
- * We also do simple expressions simplification here. Reasons are:
- * - it is very easy given we have all necessary information readily available
- * - LLVM optimization passes fail to simplify several vector expressions
- * - We often know value constraints which the optimization passes have no way
- * of knowing, such as when source arguments are known to be in [0, 1] range.
+ * Conversion between types of different bit widths is quite complex: since
+ * the register width is fixed, a different element width implies a different
+ * vector length, so values must be multiplexed into fewer vectors or
+ * demultiplexed into more.
+ *
+ * To remember there are a few invariants in type conversions:
+ *
+ * - register width must remain constant:
+ *
+ * src_type.width * src_type.length == dst_type.width * dst_type.length
+ *
+ * - total number of elements must remain constant:
+ *
+ * src_type.length * num_srcs == dst_type.length * num_dsts
+ *
+ * It is not always possible to do the conversion both accurately and
+ * efficiently, usually due to lack of adequate machine instructions. In these
+ * cases it is important not to take shortcuts here and sacrifice accuracy, as
+ * these functions can be used anywhere. In the future we might have a
+ * precision parameter which can gauge the accuracy vs efficiency compromise,
+ * but for now if the data conversion between two stages happens to be the
+ * bottleneck, then most likely one should just avoid converting at all and run
+ * both stages with the same type.
+ *
+ * Make sure to run lp_test_conv unit test after any change to this file.
*
* @author Jose Fonseca <jfonseca@vmware.com>
*/
#include "lp_bld_conv.h"
+/**
+ * Special case for converting clamped IEEE-754 floats to unsigned norms.
+ *
+ * The mathematical voodoo below may seem excessive but it is actually
+ * paramount we do it this way for several reasons. First, there is no single
+ * precision FP to unsigned integer conversion Intel SSE instruction. Second,
+ * even if there were one, since the FP's mantissa takes only a fraction of
+ * the register bits, the typical scale-and-cast approach would require double
+ * precision for accurate results, and therefore half the throughput.
+ *
+ * Although the result values can be scaled to an arbitrary bit width specified
+ * by dst_width, the actual result type will have the same width.
+ */
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
union lp_type src_type,
/**
- * Inverse of lp_build_clamped_float_to_unsigned_norm.
+ * Inverse of lp_build_clamped_float_to_unsigned_norm above.
*/
LLVMValueRef
lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
mantissa = lp_mantissa(dst_type);
- /* We cannot carry more bits than the mantissa */
n = MIN2(mantissa, src_width);
ubound = ((unsigned long long)1 << n);
}
+/**
+ * Expand the bit width.
+ *
+ * This only changes the number of bits used to represent the values, not the
+ * values themselves.
+ */
static void
lp_build_expand(LLVMBuilderRef builder,
union lp_type src_type,
/**
* Non-interleaved pack.
*
- * lo = __ l0 __ l1 __ l2 __.. __ ln
- * hi = __ h0 __ h1 __ h2 __.. __ hn
- * res = l0 l1 l2 .. ln h0 h1 h2 .. hn
+ * This will move values as
+ *
+ * lo = __ l0 __ l1 __ l2 __.. __ ln
+ * hi = __ h0 __ h1 __ h2 __.. __ hn
+ * res = l0 l1 l2 .. ln h0 h1 h2 .. hn
+ *
+ * TODO: handle saturation consistently.
*/
static LLVMValueRef
lp_build_pack2(LLVMBuilderRef builder,
}
+/**
+ * Truncate the bit width.
+ *
+ * TODO: Handle saturation consistently.
+ */
static LLVMValueRef
lp_build_trunc(LLVMBuilderRef builder,
union lp_type src_type,
/**
- * Convert between two SIMD types.
+ * Generic type conversion.
*
- * Converting between SIMD types of different element width poses a problem:
- * SIMD registers have a fixed number of bits, so different element widths
- * imply different vector lengths. Therefore we must multiplex the multiple
- * incoming sources into a single destination vector, or demux a single incoming
- * vector into multiple vectors.
+ * TODO: Take a precision argument, or even better, add a new precision member
+ * to the lp_type union.
*/
void
lp_build_conv(LLVMBuilderRef builder,
/**
- * Convenience wrapper around lp_build_conv for bit masks.
+ * Bit mask conversion.
+ *
+ * This will convert between integer masks that match the given types.
+ *
+ * The mask values should be 0 or -1, i.e., all bits either cleared or set.
+ * Any other value will likely cause unpredictable results.
+ *
+ * This is basically a very trimmed down version of lp_build_conv.
*/
void
lp_build_conv_mask(LLVMBuilderRef builder,
assert(src_type.length * num_srcs == dst_type.length * num_dsts);
/*
+ * Drop
+ *
* We assume all values are 0 or -1
*/
**************************************************************************/
/**
+ * @file
* Depth/stencil testing to LLVM IR translation.
*
+ * To be done accurately/efficiently the depth/stencil test must be done with
+ * the same type/format of the depth/stencil buffer, which implies massaging
+ * the incoming depths to fit into place. Using a more straightforward
+ * type/format for depth/stencil values internally and only converting when
+ * flushing would avoid this, but it would most likely result in depth fighting
+ * artifacts.
+ *
+ * We are free to use a different pixel layout though. Since our basic
+ * processing unit is a quad (2x2 pixel block) we store the depth/stencil
+ * values tiled, a quad at time. That is, a depth buffer containing
+ *
+ * Z11 Z12 Z13 Z14 ...
+ * Z21 Z22 Z23 Z24 ...
+ * Z31 Z32 Z33 Z34 ...
+ * Z41 Z42 Z43 Z44 ...
+ * ... ... ... ... ...
+ *
+ * will actually be stored in memory as
+ *
+ * Z11 Z12 Z21 Z22 Z13 Z14 Z23 Z24 ...
+ * Z31 Z32 Z41 Z42 Z33 Z34 Z43 Z44 ...
+ * ... ... ... ... ... ... ... ... ...
+ *
+ * FIXME: Code generate stencil test
+ *
* @author Jose Fonseca <jfonseca@vmware.com>
*/
#include "lp_bld_depth.h"
+/**
+ * Return a type appropriate for depth/stencil testing.
+ */
union lp_type
lp_depth_type(const struct util_format_description *format_desc,
unsigned length)
}
+/**
+ * Depth test.
+ */
void
lp_build_depth_test(LLVMBuilderRef builder,
const struct pipe_depth_state *state,
struct lp_build_mask_context;
-/**
- * Return a type appropriate for depth testing.
- */
union lp_type
lp_depth_type(const struct util_format_description *format_desc,
unsigned length);
/**
* @file
- * Helper
+ * Helpers for emitting intrinsic calls.
*
- * LLVM IR doesn't support all basic arithmetic operations we care about (most
- * notably min/max and saturated operations), and it is often necessary to
- * resort machine-specific intrinsics directly. The functions here hide all
- * these implementation details from the other modules.
+ * LLVM vanilla IR doesn't represent all basic arithmetic operations we care
+ * about, and it is often necessary to resort to target-specific intrinsics
+ * for performance or convenience.
*
- * We also do simple expressions simplification here. Reasons are:
- * - it is very easy given we have all necessary information readily available
- * - LLVM optimization passes fail to simplify several vector expressions
- * - We often know value constraints which the optimization passes have no way
- * of knowing, such as when source arguments are known to be in [0, 1] range.
+ * Ideally we would like to stay away from target specific intrinsics and
+ * move all the instruction selection logic into upstream LLVM where it belongs.
+ *
+ * These functions are also used for calling C functions provided by us from
+ * generated LLVM code.
*
* @author Jose Fonseca <jfonseca@vmware.com>
*/
#include <llvm-c/Core.h>
+/**
+ * Max number of arguments in an intrinsic.
+ */
#define LP_MAX_FUNC_ARGS 32
*
**************************************************************************/
+/**
+ * @file
+ * Helper functions for logical operations.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
#include "pipe/p_defines.h"
#include "lp_bld_type.h"
+++ /dev/null
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#include "pipe/p_state.h"
-
-#include "lp_bld_blend.h"
-
-
-LLVMValueRef
-lp_build_logicop(LLVMBuilderRef builder,
- unsigned logicop_func,
- LLVMValueRef src,
- LLVMValueRef dst)
-{
- LLVMTypeRef type;
- LLVMValueRef res;
-
- type = LLVMTypeOf(src);
-
- switch (logicop_func) {
- case PIPE_LOGICOP_CLEAR:
- res = LLVMConstNull(type);
- break;
- case PIPE_LOGICOP_NOR:
- res = LLVMBuildNot(builder, LLVMBuildOr(builder, src, dst, ""), "");
- break;
- case PIPE_LOGICOP_AND_INVERTED:
- res = LLVMBuildAnd(builder, LLVMBuildNot(builder, src, ""), dst, "");
- break;
- case PIPE_LOGICOP_COPY_INVERTED:
- res = LLVMBuildNot(builder, src, "");
- break;
- case PIPE_LOGICOP_AND_REVERSE:
- res = LLVMBuildAnd(builder, src, LLVMBuildNot(builder, dst, ""), "");
- break;
- case PIPE_LOGICOP_INVERT:
- res = LLVMBuildNot(builder, dst, "");
- break;
- case PIPE_LOGICOP_XOR:
- res = LLVMBuildXor(builder, src, dst, "");
- break;
- case PIPE_LOGICOP_NAND:
- res = LLVMBuildNot(builder, LLVMBuildAnd(builder, src, dst, ""), "");
- break;
- case PIPE_LOGICOP_AND:
- res = LLVMBuildAnd(builder, src, dst, "");
- break;
- case PIPE_LOGICOP_EQUIV:
- res = LLVMBuildNot(builder, LLVMBuildXor(builder, src, dst, ""), "");
- break;
- case PIPE_LOGICOP_NOOP:
- res = dst;
- break;
- case PIPE_LOGICOP_OR_INVERTED:
- res = LLVMBuildOr(builder, LLVMBuildNot(builder, src, ""), dst, "");
- break;
- case PIPE_LOGICOP_COPY:
- res = src;
- break;
- case PIPE_LOGICOP_OR_REVERSE:
- res = LLVMBuildOr(builder, src, LLVMBuildNot(builder, dst, ""), "");
- break;
- case PIPE_LOGICOP_OR:
- res = LLVMBuildOr(builder, src, dst, "");
- break;
- case PIPE_LOGICOP_SET:
- res = LLVMConstAllOnes(type);
- break;
- default:
- assert(0);
- res = src;
- }
-
- return res;
-}
*
**************************************************************************/
+/**
+ * @file
+ * Helper functions for swizzling/shuffling.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
#include "util/u_debug.h"
/**
* @file
- * Helper functions for constant building.
+ * Helper functions for swizzling/shuffling.
*
* @author Jose Fonseca <jfonseca@vmware.com>
*/
*
**************************************************************************/
+/**
+ * @file
+ * TGSI to LLVM IR translation.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
#ifndef LP_BLD_TGSI_H
#define LP_BLD_TGSI_H
*
**************************************************************************/
+/**
+ * @file
+ * TGSI to LLVM IR translation -- SoA.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ *
+ * Based on tgsi_sse2.c code written by Michal Krol, Keith Whitwell,
+ * Brian Paul, and others.
+ */
+
#include "pipe/p_config.h"
#include "pipe/p_shader_tokens.h"
#include "util/u_debug.h"
*
**************************************************************************/
+/**
+ * @file
+ * Code generate the whole fragment pipeline.
+ *
+ * The fragment pipeline consists of the following stages:
+ * - stipple (TBI)
+ * - early depth test
+ * - fragment shader
+ * - alpha test
+ * - depth/stencil test (stencil TBI)
+ * - blending
+ *
+ * This file has only the glue to assemble the fragment pipeline. The actual
+ * plumbing of converting Gallium state into LLVM IR is done elsewhere, in the
+ * lp_bld_*.[ch] files, and in a completely generic and reusable way. Here we
+ * muster the LLVM JIT execution engine to create a function that follows an
+ * established binary interface and that can be called from C directly.
+ *
+ * A big source of complexity here is that we often want to run different
+ * stages with different data types and precisions. For example,
+ * the fragment shader typically needs to be done in floats, but the
+ * depth/stencil test and blending is better done in the type that most closely
+ * matches the depth/stencil and color buffer respectively.
+ *
+ * Since the width of a SIMD vector register stays the same regardless of the
+ * element type, different types imply different number of elements, so we must
+ * code generate more instances of the stages with larger types to be able to
+ * feed/consume the stages with smaller types.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
#include "pipe/p_defines.h"
#include "util/u_memory.h"
#include "util/u_format.h"
static const unsigned char quad_offset_y[4] = {0, 0, 1, 1};
+/**
+ * Generate the position vectors.
+ *
+ * TODO: This should be called only once per fragment pipeline, for the first
+ * quad, and the neighboring quad positions obtained by additions.
+ *
+ * Parameters x and y are the integer values of the quad's upper left
+ * coordinates.
+ */
static void
generate_pos(LLVMBuilderRef builder,
LLVMValueRef x,
unsigned chan;
unsigned i;
+ /*
+ * Derive from the quad's upper left scalar coordinates the coordinates for
+ * all other quad pixels
+ */
+
x = lp_build_broadcast(builder, int_vec_type, x);
y = lp_build_broadcast(builder, int_vec_type, y);
pos[0] = x;
pos[1] = y;
+ /*
+ * Calculate z and w from the interpolation factors.
+ */
+
for(chan = 2; chan < NUM_CHANNELS; ++chan) {
LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), chan, 0);
LLVMValueRef a0 = LLVMBuildLoad(builder, LLVMBuildGEP(builder, a0_ptr, &index, 1, ""), "");
}
+/**
+ * Generate the depth test.
+ */
static void
generate_depth(struct llvmpipe_context *lp,
LLVMBuilderRef builder,
format_desc = util_format_description(lp->framebuffer.zsbuf->format);
assert(format_desc);
+ /* Pick the depth type. */
dst_type = lp_depth_type(format_desc, src_type.width*src_type.length);
+ /* FIXME: Cope with a depth test type with a different bit width. */
assert(dst_type.width == src_type.width);
assert(dst_type.length == src_type.length);
/**
- * Generate the fragment shader, depth/stencil and alpha tests.
+ * Generate the fragment shader, depth/stencil test, and alpha tests.
*/
static void
generate_fs(struct llvmpipe_context *lp,
/**
- * Generate blending code according to blend->base state.
- * The blend function will look like:
- * blend(mask, src_color, constant color, dst_color)
- * dst_color will be modified and contain the result of the blend func.
+ * Generate color blending and color output.
*/
static void
generate_blend(const struct pipe_blend_state *blend,
}
+/**
+ * Generate the runtime callable function for the whole fragment pipeline.
+ */
static struct lp_fragment_shader_variant *
generate_fragment(struct llvmpipe_context *lp,
struct lp_fragment_shader *shader,
variant->shader = shader;
memcpy(&variant->key, key, sizeof *key);
+ /* TODO: actually pick these based on the fs and color buffer
+ * characteristics. */
+
fs_type.value = 0;
fs_type.floating = TRUE; /* floating point values */
fs_type.sign = TRUE; /* values are signed */
blend_type.width = 8; /* 8-bit ubyte values */
blend_type.length = 16; /* 16 elements per vector */
+ /*
+ * Generate the function prototype. Any change here must be reflected in
+ * lp_state.h's lp_shader_fs_func function pointer type, and vice-versa.
+ */
+
fs_elem_type = lp_build_elem_type(fs_type);
fs_vec_type = lp_build_vec_type(fs_type);
fs_int_vec_type = lp_build_int_vec_type(fs_type);
lp_build_name(depth_ptr, "depth");
lp_build_name(samplers_ptr, "samplers");
+ /*
+ * Function body
+ */
+
block = LLVMAppendBasicBlock(variant->function, "entry");
builder = LLVMCreateBuilder();
LLVMPositionBuilderAtEnd(builder, block);
fs_out_color[chan][i] = out_color[chan];
}
+ /*
+ * Convert the fs's output color and mask to fit to the blending type.
+ */
+
for(chan = 0; chan < NUM_CHANNELS; ++chan) {
lp_build_conv(builder, fs_type, blend_type,
fs_out_color[chan], num_fs,
fs_mask, num_fs,
&blend_mask, 1);
+ /*
+ * Blending.
+ */
+
generate_blend(&key->blend,
builder,
blend_type,
NULL /* FIXME: blend_const_color */,
color_ptr);
- LLVMBuildRetVoid(builder);;
+ LLVMBuildRetVoid(builder);
LLVMDisposeBuilder(builder);
+ /*
+ * Translate the LLVM IR into machine code.
+ */
+
LLVMRunFunctionPassManager(screen->pass, variant->function);
#ifdef DEBUG
abort();
}
+ /* Tell where the fetch_texel function is, if the shader refers to it.
+ * TODO: this should be done elsewhere.
+ */
fetch_texel = LLVMGetNamedFunction(screen->module, "fetch_texel");
if(fetch_texel) {
static boolean first_time = TRUE;
}
-void llvmpipe_update_fs(struct llvmpipe_context *lp)
+void
+llvmpipe_update_fs(struct llvmpipe_context *lp)
{
struct lp_fragment_shader *shader = lp->fs;
struct lp_fragment_shader_variant_key key;
struct lp_fragment_shader_variant *variant;
+ /* We need to generate several variants of the fragment pipeline to match
+ * all the combinations of the contributing state atoms.
+ *
+ * TODO: there is actually no reason to tie this to context state -- the
+ * generated code could be cached globally in the screen.
+ */
+
memset(&key, 0, sizeof key);
memcpy(&key.depth, &lp->depth_stencil->depth, sizeof &key.depth);
memcpy(&key.alpha, &lp->depth_stencil->alpha, sizeof &key.alpha);