swrast: initial multi-threaded span rendering

author Andreas Fänger <a.faenger@e-sign.com>

Wed, 10 Aug 2011 08:07:29 +0000 (08:07 +0000)

committer Brian Paul <brianp@vmware.com>

Thu, 11 Aug 2011 14:33:59 +0000 (08:33 -0600)
author Andreas Fänger <a.faenger@e-sign.com>
Wed, 10 Aug 2011 08:07:29 +0000 (08:07 +0000)
committer Brian Paul <brianp@vmware.com>
Thu, 11 Aug 2011 14:33:59 +0000 (08:33 -0600)
diff --git a/common.py b/common.py

index 8657030ea3f6480109e9fb17c12f7f4dbd4f9a9f..cfee1b5dc2e58c03a2f5857586747b21ce4b31f9 100644 (file)
--- a/common.py
+++ b/common.py
@@ -88,6 +88,7 @@ def AddOptions(opts):
         opts.Add('toolchain', 'compiler toolchain', default_toolchain)
         opts.Add(BoolOption('gles', 'EXPERIMENTAL: enable OpenGL ES support', 'no'))
         opts.Add(BoolOption('llvm', 'use LLVM', default_llvm))
+       opts.Add(BoolOption('openmp', 'EXPERIMENTAL: compile with openmp (swrast)', 'no'))
         opts.Add(BoolOption('debug', 'DEPRECATED: debug build', 'yes'))
         opts.Add(BoolOption('profile', 'DEPRECATED: profile build', 'no'))
         opts.Add(BoolOption('quiet', 'DEPRECATED: profile build', 'yes'))
diff --git a/scons/gallium.py b/scons/gallium.py

index 8cd3bc7f6e0dd3385c82f5589fefb9b9b89b6f8b..7135251d7a335d5ffe18322d170069f19a749e97 100755 (executable)
--- a/scons/gallium.py
+++ b/scons/gallium.py
@@ -596,6 +596,18 @@ def generate(env):
          libs += ['m', 'pthread', 'dl']
      env.Append(LIBS = libs)
  
+    # OpenMP
+    if env['openmp']:
+        if env['msvc']:
+            env.Append(CCFLAGS = ['/openmp'])
+            # When building openmp release VS2008 link.exe crashes with LNK1103 error.
+            # Workaround: overwrite PDB flags with empty value as it isn't required anyways
+            if env['build'] == 'release':
+                env['PDB'] = ''
+        if env['gcc']:
+            env.Append(CCFLAGS = ['-fopenmp'])
+            env.Append(LIBS = ['gomp'])
+
      # Load tools
      env.Tool('lex')
      env.Tool('yacc')
diff --git a/src/mesa/swrast/s_aatritemp.h b/src/mesa/swrast/s_aatritemp.h

index 91d4f7a10abb71f1ff4a8654d2102c4c73b06c5d..77b3ae6ec7a8b612edb99c8d7d22927618133b39 100644 (file)
--- a/src/mesa/swrast/s_aatritemp.h
+++ b/src/mesa/swrast/s_aatritemp.h
@@ -181,13 +181,20 @@
        const GLfloat *pMax = vMax->attrib[FRAG_ATTRIB_WPOS];
        const GLfloat dxdy = majDx / majDy;
        const GLfloat xAdj = dxdy < 0.0F ? -dxdy : 0.0F;
-      GLfloat x = pMin[0] - (yMin - iyMin) * dxdy;
        GLint iy;
-      for (iy = iyMin; iy < iyMax; iy++, x += dxdy) {
+#ifdef _OPENMP
+#pragma omp parallel for schedule(dynamic) private(iy) firstprivate(span)
+#endif
+      for (iy = iyMin; iy < iyMax; iy++) {
+         GLfloat x = pMin[0] - (yMin - iy) * dxdy;
           GLint ix, startX = (GLint) (x - xAdj);
           GLuint count;
           GLfloat coverage = 0.0F;
  
+#ifdef _OPENMP
+         /* each thread needs to use a different (global) SpanArrays variable */
+         span.array = SWRAST_CONTEXT(ctx)->SpanArrays + omp_get_thread_num();
+#endif
           /* skip over fragments with zero coverage */
           while (startX < MAX_WIDTH) {
              coverage = compute_coveragef(pMin, pMid, pMax, startX, iy);
@@ -228,13 +235,12 @@
              coverage = compute_coveragef(pMin, pMid, pMax, ix, iy);
           }
           
-         if (ix <= startX)
-            continue;
-         
-         span.x = startX;
-         span.y = iy;
-         span.end = (GLuint) ix - (GLuint) startX;
-         _swrast_write_rgba_span(ctx, &span);
+         if (ix > startX) {
+            span.x = startX;
+            span.y = iy;
+            span.end = (GLuint) ix - (GLuint) startX;
+            _swrast_write_rgba_span(ctx, &span);
+         }
        }
     }
     else {
@@ -244,13 +250,20 @@
        const GLfloat *pMax = vMax->attrib[FRAG_ATTRIB_WPOS];
        const GLfloat dxdy = majDx / majDy;
        const GLfloat xAdj = dxdy > 0 ? dxdy : 0.0F;
-      GLfloat x = pMin[0] - (yMin - iyMin) * dxdy;
        GLint iy;
-      for (iy = iyMin; iy < iyMax; iy++, x += dxdy) {
+#ifdef _OPENMP
+#pragma omp parallel for schedule(dynamic) private(iy) firstprivate(span)
+#endif
+      for (iy = iyMin; iy < iyMax; iy++) {
+         GLfloat x = pMin[0] - (yMin - iy) * dxdy;
           GLint ix, left, startX = (GLint) (x + xAdj);
           GLuint count, n;
           GLfloat coverage = 0.0F;
           
+#ifdef _OPENMP
+         /* each thread needs to use a different (global) SpanArrays variable */
+         span.array = SWRAST_CONTEXT(ctx)->SpanArrays + omp_get_thread_num();
+#endif
           /* make sure we're not past the window edge */
           if (startX >= ctx->DrawBuffer->_Xmax) {
              startX = ctx->DrawBuffer->_Xmax - 1;
@@ -296,31 +309,30 @@
           ATTRIB_LOOP_END
  #endif
  
-         if (startX <= ix)
-            continue;
-
-         n = (GLuint) startX - (GLuint) ix;
+         if (startX > ix) {
+            n = (GLuint) startX - (GLuint) ix;
  
-         left = ix + 1;
+            left = ix + 1;
  
-         /* shift all values to the left */
-         /* XXX this is temporary */
-         {
-            SWspanarrays *array = span.array;
-            GLint j;
-            for (j = 0; j < (GLint) n; j++) {
-               array->coverage[j] = array->coverage[j + left];
-               COPY_CHAN4(array->rgba[j], array->rgba[j + left]);
+            /* shift all values to the left */
+            /* XXX this is temporary */
+            {
+               SWspanarrays *array = span.array;
+               GLint j;
+               for (j = 0; j < (GLint) n; j++) {
+                  array->coverage[j] = array->coverage[j + left];
+                  COPY_CHAN4(array->rgba[j], array->rgba[j + left]);
  #ifdef DO_Z
-               array->z[j] = array->z[j + left];
+                  array->z[j] = array->z[j + left];
  #endif
+               }
              }
-         }
  
-         span.x = left;
-         span.y = iy;
-         span.end = n;
-         _swrast_write_rgba_span(ctx, &span);
+            span.x = left;
+            span.y = iy;
+            span.end = n;
+            _swrast_write_rgba_span(ctx, &span);
+         }
        }
     }
  }
diff --git a/src/mesa/swrast/s_context.c b/src/mesa/swrast/s_context.c

index def1531d7ffe7e0306635c25ff7a4b0cf9259c1f..4434f11b9906443c9728320633371a1a8b54264d 100644 (file)
--- a/src/mesa/swrast/s_context.c
+++ b/src/mesa/swrast/s_context.c
@@ -772,6 +772,11 @@ _swrast_CreateContext( struct gl_context *ctx )
  {
     GLuint i;
     SWcontext *swrast = (SWcontext *)CALLOC(sizeof(SWcontext));
+#ifdef _OPENMP
+   const GLint maxThreads = omp_get_max_threads();
+#else
+   const GLint maxThreads = 1;
+#endif
  
     if (SWRAST_DEBUG) {
        _mesa_debug(ctx, "_swrast_CreateContext\n");
@@ -806,19 +811,25 @@ _swrast_CreateContext( struct gl_context *ctx )
     for (i = 0; i < MAX_TEXTURE_IMAGE_UNITS; i++)
        swrast->TextureSample[i] = NULL;
  
-   swrast->SpanArrays = MALLOC_STRUCT(sw_span_arrays);
+   /* SpanArrays is global and shared by all SWspan instances. However, when
+    * using multiple threads, it is necessary to have one SpanArrays instance
+    * per thread.
+    */
+   swrast->SpanArrays = (SWspanarrays *) MALLOC(maxThreads * sizeof(SWspanarrays));
     if (!swrast->SpanArrays) {
        FREE(swrast);
        return GL_FALSE;
     }
-   swrast->SpanArrays->ChanType = CHAN_TYPE;
+   for(i = 0; i < maxThreads; i++) {
+      swrast->SpanArrays[i].ChanType = CHAN_TYPE;
  #if CHAN_TYPE == GL_UNSIGNED_BYTE
-   swrast->SpanArrays->rgba = swrast->SpanArrays->rgba8;
+      swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].rgba8;
  #elif CHAN_TYPE == GL_UNSIGNED_SHORT
-   swrast->SpanArrays->rgba = swrast->SpanArrays->rgba16;
+      swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].rgba16;
  #else
-   swrast->SpanArrays->rgba = swrast->SpanArrays->attribs[FRAG_ATTRIB_COL0];
+      swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].attribs[FRAG_ATTRIB_COL0];
  #endif
+   }
  
     /* init point span buffer */
     swrast->PointSpan.primitive = GL_POINT;
@@ -826,7 +837,10 @@ _swrast_CreateContext( struct gl_context *ctx )
     swrast->PointSpan.facing = 0;
     swrast->PointSpan.array = swrast->SpanArrays;
  
-   swrast->TexelBuffer = (GLfloat *) MALLOC(ctx->Const.MaxTextureImageUnits *
+   /* TexelBuffer is also global and normally shared by all SWspan instances;
+    * when running with multiple threads, create one per thread.
+    */
+   swrast->TexelBuffer = (GLfloat *) MALLOC(ctx->Const.MaxTextureImageUnits * maxThreads *
                                             MAX_WIDTH * 4 * sizeof(GLfloat));
     if (!swrast->TexelBuffer) {
        FREE(swrast->SpanArrays);
diff --git a/src/mesa/swrast/s_texcombine.c b/src/mesa/swrast/s_texcombine.c

index 086ed0b33d7477dab0af85222d690f110b3b10f2..80b9dff3cc26be0e6ec096c41f4207c37d1f88a8 100644 (file)
--- a/src/mesa/swrast/s_texcombine.c
+++ b/src/mesa/swrast/s_texcombine.c
@@ -48,7 +48,11 @@ typedef float (*float4_array)[4];
  static INLINE float4_array
  get_texel_array(SWcontext *swrast, GLuint unit)
  {
+#ifdef _OPENMP
+   return (float4_array) (swrast->TexelBuffer + unit * MAX_WIDTH * 4 * omp_get_num_threads() + (MAX_WIDTH * 4 * omp_get_thread_num()));
+#else
     return (float4_array) (swrast->TexelBuffer + unit * MAX_WIDTH * 4);
+#endif
  }
  
  
diff --git a/src/mesa/tnl/t_pipeline.c b/src/mesa/tnl/t_pipeline.c

index 18f095f0d4bfc3ef24ba346ab4f775e9945024ba..881d5d5f53550e0db0d76d61d98cc7f3ef553218 100644 (file)
--- a/src/mesa/tnl/t_pipeline.c
+++ b/src/mesa/tnl/t_pipeline.c
@@ -146,7 +146,17 @@ void _tnl_run_pipeline( struct gl_context *ctx )
          _tnl_notify_pipeline_output_change( ctx );
     }
  
+#ifndef _OPENMP
+   /* Don't adjust FPU precision mode in case multiple threads are to be used.
+    * This would require that the additional threads also changed the FPU mode
+    * which is quite a mess as this had to be done in all parallelized sections;
+    * otherwise the master thread and all other threads are running in different
+    * modes, producing inconsistent results.
+    * Note that all x64 implementations don't define/use START_FAST_MATH, so
+    * this is "hack" is only used in i386 mode
+    */
     START_FAST_MATH(__tmp);
+#endif
  
     for (i = 0; i < tnl->pipeline.nr_stages ; i++) {
        struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i];
@@ -154,7 +164,9 @@ void _tnl_run_pipeline( struct gl_context *ctx )
          break;
     }
  
+#ifndef _OPENMP
     END_FAST_MATH(__tmp);
+#endif
  }
author	Andreas Fänger <a.faenger@e-sign.com>
	Wed, 10 Aug 2011 08:07:29 +0000 (08:07 +0000)
committer	Brian Paul <brianp@vmware.com>
	Thu, 11 Aug 2011 14:33:59 +0000 (08:33 -0600)
common.py		patch \| blob \| history
scons/gallium.py		patch \| blob \| history
src/mesa/swrast/s_aatritemp.h		patch \| blob \| history
src/mesa/swrast/s_context.c		patch \| blob \| history
src/mesa/swrast/s_texcombine.c		patch \| blob \| history
src/mesa/tnl/t_pipeline.c		patch \| blob \| history