From: José Fonseca <jfonseca@vmware.com>
Date: Fri, 9 Apr 2010 15:30:52 +0000 (+0100)
Subject: util: Get all depth stencil tests passing.
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=866d22ccce93fa507aca06aaa25217d18b1cdd0f;p=mesa.git

util: Get all depth stencil tests passing.

Note that the tests don't exercise all the paths; in particular, they
don't cover strides or check that the old value is preserved.
---

diff --git a/progs/gallium/unit/u_format_test.c b/progs/gallium/unit/u_format_test.c
index 442023b512e..cfde6af75e0 100644
--- a/progs/gallium/unit/u_format_test.c
+++ b/progs/gallium/unit/u_format_test.c
@@ -71,7 +71,7 @@ print_packed(const struct util_format_description *format_desc,
 
 
 static void
-print_unpacked_doubl(const struct util_format_description *format_desc,
+print_unpacked_rgba_doubl(const struct util_format_description *format_desc,
                      const char *prefix,
                      const double unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH][4],
                      const char *suffix)
@@ -92,7 +92,7 @@ print_unpacked_doubl(const struct util_format_description *format_desc,
 
 
 static void
-print_unpacked_float(const struct util_format_description *format_desc,
+print_unpacked_rgba_float(const struct util_format_description *format_desc,
                      const char *prefix,
                      float unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH][4],
                      const char *suffix)
@@ -113,7 +113,7 @@ print_unpacked_float(const struct util_format_description *format_desc,
 
 
 static void
-print_unpacked_8unorm(const struct util_format_description *format_desc,
+print_unpacked_rgba_8unorm(const struct util_format_description *format_desc,
                       const char *prefix,
                       uint8_t unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH][4],
                       const char *suffix)
@@ -132,6 +132,67 @@ print_unpacked_8unorm(const struct util_format_description *format_desc,
 }
 
 
+static void
+print_unpacked_z_float(const struct util_format_description *format_desc,
+                       const char *prefix,
+                       float unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH],
+                       const char *suffix)
+{
+   unsigned i, j;
+   const char *sep = "";   /* nothing is printed before the first value */
+   /* Print prefix, every float of the block.width x block.height block, then suffix. */
+   printf("%s", prefix);
+   for (i = 0; i < format_desc->block.height; ++i) {
+      for (j = 0; j < format_desc->block.width; ++j) {
+         printf("%s%f", sep, unpacked[i][j]);
+         sep = ", ";
+      }
+      sep = ",\n";   /* the first value of the next row starts on a new line */
+   }
+   printf("%s", suffix);
+}
+
+
+static void
+print_unpacked_z_32unorm(const struct util_format_description *format_desc,
+                         const char *prefix,
+                         uint32_t unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH],
+                         const char *suffix)
+{
+   unsigned i, j;
+   const char *sep = "";   /* nothing is printed before the first value */
+   /* Print prefix, every value of the block as 8 hex digits, then suffix. */
+   printf("%s", prefix);
+   for (i = 0; i < format_desc->block.height; ++i) {
+      for (j = 0; j < format_desc->block.width; ++j) {
+         printf("%s0x%08x", sep, unpacked[i][j]);
+         sep = ", ";
+      }
+   }   /* NOTE(review): sep is not reset per row here, so all rows share one output line */
+   printf("%s", suffix);
+}
+
+
+static void
+print_unpacked_s_8uscaled(const struct util_format_description *format_desc,
+                          const char *prefix,
+                          uint8_t unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH],
+                          const char *suffix)
+{
+   unsigned i, j;
+   const char *sep = "";   /* nothing is printed before the first value */
+   /* Print prefix, every value of the block as 2 hex digits, then suffix. */
+   printf("%s", prefix);
+   for (i = 0; i < format_desc->block.height; ++i) {
+      for (j = 0; j < format_desc->block.width; ++j) {
+         printf("%s0x%02x", sep, unpacked[i][j]);
+         sep = ", ";
+      }
+   }   /* NOTE(review): sep is not reset per row here, so all rows share one output line */
+   printf("%s", suffix);
+}
+
+
 static boolean
 test_format_fetch_rgba_float(const struct util_format_description *format_desc,
                              const struct util_format_test_case *test)
@@ -153,8 +214,8 @@ test_format_fetch_rgba_float(const struct util_format_description *format_desc,
    }
 
    if (!success) {
-      print_unpacked_float(format_desc, "FAILED: ", unpacked, " obtained\n");
-      print_unpacked_doubl(format_desc, "        ", test->unpacked, " expected\n");
+      print_unpacked_rgba_float(format_desc, "FAILED: ", unpacked, " obtained\n");
+      print_unpacked_rgba_doubl(format_desc, "        ", test->unpacked, " expected\n");
    }
 
    return success;
@@ -185,8 +246,8 @@ test_format_unpack_rgba_float(const struct util_format_description *format_desc,
    }
 
    if (!success) {
-      print_unpacked_float(format_desc, "FAILED: ", unpacked, " obtained\n");
-      print_unpacked_doubl(format_desc, "        ", test->unpacked, " expected\n");
+      print_unpacked_rgba_float(format_desc, "FAILED: ", unpacked, " obtained\n");
+      print_unpacked_rgba_doubl(format_desc, "        ", test->unpacked, " expected\n");
    }
 
    return success;
@@ -194,7 +255,6 @@ test_format_unpack_rgba_float(const struct util_format_description *format_desc,
 
 
 static boolean
-
 test_format_pack_rgba_float(const struct util_format_description *format_desc,
                             const struct util_format_test_case *test)
 {
@@ -290,8 +350,8 @@ test_format_unpack_rgba_8unorm(const struct util_format_description *format_desc
    }
 
    if (!success) {
-      print_unpacked_8unorm(format_desc, "FAILED: ", unpacked, " obtained\n");
-      print_unpacked_8unorm(format_desc, "        ", expected, " expected\n");
+      print_unpacked_rgba_8unorm(format_desc, "FAILED: ", unpacked, " obtained\n");
+      print_unpacked_rgba_8unorm(format_desc, "        ", expected, " expected\n");
    }
 
    return success;
@@ -343,6 +403,223 @@ test_format_pack_rgba_8unorm(const struct util_format_description *format_desc,
 }
 
 
+static boolean
+test_format_unpack_z_float(const struct util_format_description *format_desc,
+                              const struct util_format_test_case *test)
+{
+   float unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH] = { { 0 } };
+   unsigned i, j;
+   boolean success;
+   /* Unpack one block of test->packed; dst stride is one row of unpacked[], src stride is 0. */
+   format_desc->unpack_z_float(&unpacked[0][0], sizeof unpacked[0],
+                               test->packed, 0,
+                               format_desc->block.width, format_desc->block.height);
+   /* Only component 0 of the expected values is compared with the unpacked depth. */
+   success = TRUE;
+   for (i = 0; i < format_desc->block.height; ++i) {
+      for (j = 0; j < format_desc->block.width; ++j) {
+         if (!compare_float(test->unpacked[i][j][0], unpacked[i][j])) {
+            success = FALSE;   /* keep scanning; any mismatch fails the case */
+         }
+      }
+   }
+   /* On mismatch print the obtained depths, then the expected values via the rgba printer. */
+   if (!success) {
+      print_unpacked_z_float(format_desc, "FAILED: ", unpacked, " obtained\n");
+      print_unpacked_rgba_doubl(format_desc, "        ", test->unpacked, " expected\n");
+   }
+
+   return success;
+}
+
+
+static boolean
+test_format_pack_z_float(const struct util_format_description *format_desc,
+                            const struct util_format_test_case *test)
+{
+   float unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH];
+   uint8_t packed[UTIL_FORMAT_MAX_PACKED_BYTES];
+   unsigned i, j;
+   boolean success;
+   /* Start from an all-zero packed buffer and build the float input from component 0. */
+   memset(packed, 0, sizeof packed);
+   for (i = 0; i < format_desc->block.height; ++i) {
+      for (j = 0; j < format_desc->block.width; ++j) {
+         unpacked[i][j] = (float) test->unpacked[i][j][0];
+         if (test->unpacked[i][j][1]) {
+            return TRUE;   /* non-zero component 1: case is reported as passing without packing */
+         }   /* NOTE(review): presumably because pack_z cannot produce the stencil bits - confirm */
+      }
+   }
+   /* Pack one block; dst stride is 0, src stride is one row of unpacked[]. */
+   format_desc->pack_z_float(packed, 0,
+                             &unpacked[0][0], sizeof unpacked[0],
+                             format_desc->block.width, format_desc->block.height);
+   /* Compare byte by byte, looking only at the bits set in test->mask. */
+   success = TRUE;
+   for (i = 0; i < format_desc->block.bits/8; ++i)
+      if ((test->packed[i] & test->mask[i]) != (packed[i] & test->mask[i]))
+         success = FALSE;
+
+   if (!success) {
+      print_packed(format_desc, "FAILED: ", packed, " obtained\n");
+      print_packed(format_desc, "        ", test->packed, " expected\n");
+   }
+
+   return success;
+}
+
+
+static boolean
+test_format_unpack_z_32unorm(const struct util_format_description *format_desc,
+                               const struct util_format_test_case *test)
+{
+   uint32_t unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH] = { { 0 } };
+   uint32_t expected[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH] = { { 0 } };
+   unsigned i, j;
+   boolean success;
+   /* Unpack one block of test->packed; dst stride is one row of unpacked[], src stride is 0. */
+   format_desc->unpack_z_32unorm(&unpacked[0][0], sizeof unpacked[0],
+                                 test->packed, 0,
+                                 format_desc->block.width, format_desc->block.height);
+   /* Build the expected 32-bit values from component 0 of the test case. */
+   for (i = 0; i < format_desc->block.height; ++i) {
+      for (j = 0; j < format_desc->block.width; ++j) {
+         expected[i][j] = test->unpacked[i][j][0] * 0xffffffff;   /* scaled by the 32-bit maximum */
+      }
+   }
+   /* Values must match exactly; no tolerance is applied. */
+   success = TRUE;
+   for (i = 0; i < format_desc->block.height; ++i) {
+      for (j = 0; j < format_desc->block.width; ++j) {
+         if (expected[i][j] != unpacked[i][j]) {
+            success = FALSE;
+         }
+      }
+   }
+
+   if (!success) {
+      print_unpacked_z_32unorm(format_desc, "FAILED: ", unpacked, " obtained\n");
+      print_unpacked_z_32unorm(format_desc, "        ", expected, " expected\n");
+   }
+
+   return success;
+}
+
+
+static boolean
+test_format_pack_z_32unorm(const struct util_format_description *format_desc,
+                             const struct util_format_test_case *test)
+{
+   uint32_t unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH];
+   uint8_t packed[UTIL_FORMAT_MAX_PACKED_BYTES];
+   unsigned i, j;
+   boolean success;
+   /* Build the 32-bit input from component 0 of the test case. */
+   for (i = 0; i < format_desc->block.height; ++i) {
+      for (j = 0; j < format_desc->block.width; ++j) {
+         unpacked[i][j] = test->unpacked[i][j][0] * 0xffffffff;   /* scaled by the 32-bit maximum */
+         if (test->unpacked[i][j][1]) {
+            return TRUE;   /* non-zero component 1: case is reported as passing without packing */
+         }   /* NOTE(review): presumably because pack_z cannot produce the stencil bits - confirm */
+      }
+   }
+
+   memset(packed, 0, sizeof packed);   /* pack into an all-zero buffer */
+   /* Pack one block; dst stride is 0, src stride is one row of unpacked[]. */
+   format_desc->pack_z_32unorm(packed, 0,
+                               &unpacked[0][0], sizeof unpacked[0],
+                               format_desc->block.width, format_desc->block.height);
+   /* Compare byte by byte, looking only at the bits set in test->mask. */
+   success = TRUE;
+   for (i = 0; i < format_desc->block.bits/8; ++i)
+      if ((test->packed[i] & test->mask[i]) != (packed[i] & test->mask[i]))
+         success = FALSE;
+
+   if (!success) {
+      print_packed(format_desc, "FAILED: ", packed, " obtained\n");
+      print_packed(format_desc, "        ", test->packed, " expected\n");
+   }
+
+   return success;
+}
+
+
+static boolean
+test_format_unpack_s_8uscaled(const struct util_format_description *format_desc,
+                               const struct util_format_test_case *test)
+{
+   uint8_t unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH] = { { 0 } };
+   uint8_t expected[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH] = { { 0 } };
+   unsigned i, j;
+   boolean success;
+   /* Unpack one block of test->packed; dst stride is one row of unpacked[], src stride is 0. */
+   format_desc->unpack_s_8uscaled(&unpacked[0][0], sizeof unpacked[0],
+                                  test->packed, 0,
+                                  format_desc->block.width, format_desc->block.height);
+   /* The expected stencil is component 1 of the test case, converted to uint8_t. */
+   for (i = 0; i < format_desc->block.height; ++i) {
+      for (j = 0; j < format_desc->block.width; ++j) {
+         expected[i][j] = test->unpacked[i][j][1];
+      }
+   }
+   /* Values must match exactly. */
+   success = TRUE;
+   for (i = 0; i < format_desc->block.height; ++i) {
+      for (j = 0; j < format_desc->block.width; ++j) {
+         if (expected[i][j] != unpacked[i][j]) {
+            success = FALSE;
+         }
+      }
+   }
+
+   if (!success) {
+      print_unpacked_s_8uscaled(format_desc, "FAILED: ", unpacked, " obtained\n");
+      print_unpacked_s_8uscaled(format_desc, "        ", expected, " expected\n");
+   }
+
+   return success;
+}
+
+
+static boolean
+test_format_pack_s_8uscaled(const struct util_format_description *format_desc,
+                             const struct util_format_test_case *test)
+{
+   uint8_t unpacked[UTIL_FORMAT_MAX_UNPACKED_HEIGHT][UTIL_FORMAT_MAX_UNPACKED_WIDTH];
+   uint8_t packed[UTIL_FORMAT_MAX_PACKED_BYTES];
+   unsigned i, j;
+   boolean success;
+   /* Build the stencil input from component 1 of the test case. */
+   for (i = 0; i < format_desc->block.height; ++i) {
+      for (j = 0; j < format_desc->block.width; ++j) {
+         unpacked[i][j] = test->unpacked[i][j][1];
+         if (test->unpacked[i][j][0]) {
+            return TRUE;   /* non-zero component 0: case is reported as passing without packing */
+         }   /* NOTE(review): presumably because pack_s cannot produce the depth bits - confirm */
+      }
+   }
+
+   memset(packed, 0, sizeof packed);   /* pack into an all-zero buffer */
+   /* Pack one block; dst stride is 0, src stride is one row of unpacked[]. */
+   format_desc->pack_s_8uscaled(packed, 0,
+                                &unpacked[0][0], sizeof unpacked[0],
+                                format_desc->block.width, format_desc->block.height);
+   /* Compare byte by byte, looking only at the bits set in test->mask. */
+   success = TRUE;
+   for (i = 0; i < format_desc->block.bits/8; ++i)
+      if ((test->packed[i] & test->mask[i]) != (packed[i] & test->mask[i]))
+         success = FALSE;
+
+   if (!success) {
+      print_packed(format_desc, "FAILED: ", packed, " obtained\n");
+      print_packed(format_desc, "        ", test->packed, " expected\n");
+   }
+
+   return success;
+}
+
+
 typedef boolean
 (*test_func_t)(const struct util_format_description *format_desc,
                const struct util_format_test_case *test);
@@ -405,6 +682,13 @@ test_all(void)
       TEST_ONE_FUNC(pack_rgba_8unorm);
       TEST_ONE_FUNC(unpack_rgba_8unorm);
 
+      TEST_ONE_FUNC(unpack_z_32unorm);
+      TEST_ONE_FUNC(pack_z_32unorm);
+      TEST_ONE_FUNC(unpack_z_float);
+      TEST_ONE_FUNC(pack_z_float);
+      TEST_ONE_FUNC(unpack_s_8uscaled);
+      TEST_ONE_FUNC(pack_s_8uscaled);
+
 #     undef TEST_ONE_FUNC
    }
 
diff --git a/src/gallium/auxiliary/util/u_format.h b/src/gallium/auxiliary/util/u_format.h
index c96a39dc096..5e3dc694be6 100644
--- a/src/gallium/auxiliary/util/u_format.h
+++ b/src/gallium/auxiliary/util/u_format.h
@@ -286,9 +286,9 @@ struct util_format_description
     * Only defined for stencil formats.
     */
    void
-   (*unpack_s_32unorm)(uint8_t *dst, unsigned dst_stride,
-                       const uint8_t *src, unsigned src_stride,
-                       unsigned width, unsigned height);
+   (*unpack_s_8uscaled)(uint8_t *dst, unsigned dst_stride,
+                        const uint8_t *src, unsigned src_stride,
+                        unsigned width, unsigned height);
 
    /**
     * Pack pixels from S8_USCALED.
diff --git a/src/gallium/auxiliary/util/u_format_tests.c b/src/gallium/auxiliary/util/u_format_tests.c
index 60cc91bd8b7..b1df6c49c42 100644
--- a/src/gallium/auxiliary/util/u_format_tests.c
+++ b/src/gallium/auxiliary/util/u_format_tests.c
@@ -334,9 +334,41 @@ util_format_test_cases[] =
    {PIPE_FORMAT_R8G8Bx_SNORM, PACKED_2x8(0xff, 0xff), PACKED_2x8(0x00, 0x81), UNPACKED_1x1( 0.0, -1.0, 0.0, 1.0)},
 
    /*
-    * TODO: Depth-stencil formats
+    * Depth-stencil formats
     */
 
+   {PIPE_FORMAT_S8_USCALED, PACKED_1x8(0xff), PACKED_1x8(0x00), UNPACKED_1x1(0.0,   0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_S8_USCALED, PACKED_1x8(0xff), PACKED_1x8(0xff), UNPACKED_1x1(0.0, 255.0, 0.0, 0.0)},
+
+   {PIPE_FORMAT_Z16_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0x0000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_Z16_UNORM, PACKED_1x16(0xffff), PACKED_1x16(0xffff), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+
+   {PIPE_FORMAT_Z32_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_Z32_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+
+   {PIPE_FORMAT_Z32_FLOAT, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_Z32_FLOAT, PACKED_1x32(0xffffffff), PACKED_1x32(0x3f800000), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+
+   {PIPE_FORMAT_Z24_UNORM_S8_USCALED, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0,   0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_Z24_UNORM_S8_USCALED, PACKED_1x32(0xffffffff), PACKED_1x32(0x00ffffff), UNPACKED_1x1(1.0,   0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_Z24_UNORM_S8_USCALED, PACKED_1x32(0xffffffff), PACKED_1x32(0xff000000), UNPACKED_1x1(0.0, 255.0, 0.0, 0.0)},
+   {PIPE_FORMAT_Z24_UNORM_S8_USCALED, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 255.0, 0.0, 0.0)},
+
+   {PIPE_FORMAT_S8_USCALED_Z24_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0,   0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_S8_USCALED_Z24_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffff00), UNPACKED_1x1(1.0,   0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_S8_USCALED_Z24_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0x000000ff), UNPACKED_1x1(0.0, 255.0, 0.0, 0.0)},
+   {PIPE_FORMAT_S8_USCALED_Z24_UNORM, PACKED_1x32(0xffffffff), PACKED_1x32(0xffffffff), UNPACKED_1x1(1.0, 255.0, 0.0, 0.0)},
+
+   {PIPE_FORMAT_Z24X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_Z24X8_UNORM, PACKED_1x32(0x00ffffff), PACKED_1x32(0x00ffffff), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+
+   {PIPE_FORMAT_X8Z24_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0x00000000), UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_X8Z24_UNORM, PACKED_1x32(0xffffff00), PACKED_1x32(0xffffff00), UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
+
+   {PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED, PACKED_2x32(0xffffffff, 0x000000ff), PACKED_2x32(0x00000000, 0x00000000), UNPACKED_1x1( 0.0,   0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED, PACKED_2x32(0xffffffff, 0x000000ff), PACKED_2x32(0x3f800000, 0x00000000), UNPACKED_1x1( 1.0,   0.0, 0.0, 0.0)},
+   {PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED, PACKED_2x32(0xffffffff, 0x000000ff), PACKED_2x32(0x00000000, 0x000000ff), UNPACKED_1x1( 0.0, 255.0, 0.0, 0.0)},
+
    /*
     * YUV formats
     */
@@ -910,7 +942,6 @@ util_format_test_cases[] =
    {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00000000, 0x00000000, 0x00000000, 0xffff0000), UNPACKED_1x1( 0.0,  0.0,  0.0, -1.0)},
    {PIPE_FORMAT_R32G32B32A32_FIXED, PACKED_4x32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), PACKED_4x32(0x00010000, 0x00010000, 0x00010000, 0x00010000), UNPACKED_1x1( 1.0,  1.0,  1.0,  1.0)},
 
-
    /*
     * D3D9 specific vertex formats
     */
diff --git a/src/gallium/auxiliary/util/u_format_zs.c b/src/gallium/auxiliary/util/u_format_zs.c
index 68c88e520c9..792d69c214c 100644
--- a/src/gallium/auxiliary/util/u_format_zs.c
+++ b/src/gallium/auxiliary/util/u_format_zs.c
@@ -31,6 +31,86 @@
 #include "u_format_zs.h"
 
 
+/*
+ * z32_unorm conversion functions
+ */
+
+static INLINE uint16_t
+z32_unorm_to_z16_unorm(uint32_t z)
+{
+   /* Keep the 16 most significant bits; approximates z * 0xffff / 0xffffffff */
+   return z >> 16;
+}
+
+static INLINE uint32_t
+z16_unorm_to_z32_unorm(uint16_t z)
+{
+   /* z * 0xffffffff / 0xffff, i.e. z replicated into both halves; widen first, as z promotes to int and z << 16 could overflow it */
+   return ((uint32_t)z << 16) | z;
+}
+
+static INLINE uint32_t
+z32_unorm_to_z24_unorm(uint32_t z)
+{
+   /* Keep the 24 most significant bits; approximates z * 0xffffff / 0xffffffff */
+   return z >> 8;
+}
+
+static INLINE uint32_t
+z24_unorm_to_z32_unorm(uint32_t z)
+{
+   /* Approximates z * 0xffffffff / 0xffffff: the top 8 of the 24 bits are repeated in the low byte; expects z <= 0xffffff */
+   return (z << 8) | (z >> 16);
+}
+
+
+/*
+ * z32_float conversion functions
+ */
+
+static INLINE uint16_t
+z32_float_to_z16_unorm(float z)
+{
+   const float scale = 0xffff;   /* largest 16-bit value */
+   return (uint16_t)(z * scale);   /* fraction is truncated; z is not clamped here */
+}
+
+static INLINE float
+z16_unorm_to_z32_float(uint16_t z)
+{
+   const float scale = 1.0 / 0xffff;   /* reciprocal of the largest 16-bit value, stored as float */
+   return (float)(z * scale);   /* 0xffff maps to approximately 1.0 */
+}
+
+static INLINE uint32_t
+z32_float_to_z24_unorm(float z)
+{
+   const double scale = 0xffffff;   /* largest 24-bit value; the product is computed in double */
+   return (uint32_t)(z * scale) & 0xffffff;   /* fraction is truncated, result masked to 24 bits */
+}
+
+static INLINE float
+z24_unorm_to_z32_float(uint32_t z)
+{
+   const double scale = 1.0 / 0xffffff;   /* reciprocal of the largest 24-bit value */
+   return (float)(z * scale);   /* product is computed in double, then narrowed to float */
+}
+
+static INLINE uint32_t
+z32_float_to_z32_unorm(float z)
+{
+   const double scale = 0xffffffff;   /* largest 32-bit value; the product is computed in double */
+   return (uint32_t)(z * scale);   /* fraction is truncated; z is not clamped here */
+}
+
+static INLINE float
+z32_unorm_to_z32_float(uint32_t z)
+{
+   const double scale = 1.0 / 0xffffffff;   /* reciprocal of the largest 32-bit value */
+   return (float)(z * scale);   /* product is computed in double, then narrowed to float */
+}
+
+
 void
 util_format_s8_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride,
                                          const uint8_t *src_row, unsigned src_stride,
@@ -71,8 +151,7 @@ util_format_z16_unorm_unpack_z_float(float *dst_row, unsigned dst_stride,
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap16(value);
 #endif
-         dst[0] = (float)(value * (1.0f/0xffff));
-         dst += 1;
+         *dst++ = z16_unorm_to_z32_float(value);
       }
       src_row += src_stride/sizeof(*src_row);
       dst_row += dst_stride/sizeof(*dst_row);
@@ -90,12 +169,11 @@ util_format_z16_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride,
       uint16_t *dst = (uint16_t *)dst_row;
       for(x = 0; x < width; ++x) {
          uint16_t value;
-         value = (uint16_t)(*src * 0xffff);
+         value = z32_float_to_z16_unorm(*src++);
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap16(value);
 #endif
          *dst++ = value;
-         src += 1;
       }
       dst_row += dst_stride/sizeof(*dst_row);
       src_row += src_stride/sizeof(*src_row);
@@ -116,8 +194,7 @@ util_format_z16_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride,
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap16(value);
 #endif
-         /* value * 0xffffffff / 0xffff */
-         *dst++ = (value << 16) | value;
+         *dst++ = z16_unorm_to_z32_unorm(value);
       }
       src_row += src_stride/sizeof(*src_row);
       dst_row += dst_stride/sizeof(*dst_row);
@@ -135,7 +212,7 @@ util_format_z16_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride,
       uint16_t *dst = (uint16_t *)dst_row;
       for(x = 0; x < width; ++x) {
          uint16_t value;
-         value = (uint16_t)(*src++ >> 16);
+         value = z32_unorm_to_z16_unorm(*src++);
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap16(value);
 #endif
@@ -160,7 +237,7 @@ util_format_z32_unorm_unpack_z_float(float *dst_row, unsigned dst_stride,
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
-         *dst++ = (float)(value * (1.0/0xffffffff));
+         *dst++ = z32_unorm_to_z32_float(value);
       }
       src_row += src_stride/sizeof(*src_row);
       dst_row += dst_stride/sizeof(*dst_row);
@@ -178,12 +255,11 @@ util_format_z32_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride,
       uint32_t *dst = (uint32_t *)dst_row;
       for(x = 0; x < width; ++x) {
          uint32_t value;
-         value = (uint32_t)(*src * (double)0xffffffff);
+         value = z32_float_to_z32_unorm(*src++);
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
          *dst++ = value;
-         ++src;
       }
       dst_row += dst_stride/sizeof(*dst_row);
       src_row += src_stride/sizeof(*src_row);
@@ -252,7 +328,7 @@ util_format_z32_float_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride,
       uint32_t *dst = dst_row;
       const float *src = (const float *)src_row;
       for(x = 0; x < width; ++x) {
-         *dst++ = (uint32_t)(*src++ * (double)0xffffffff);
+         *dst++ = z32_float_to_z32_unorm(*src++);
       }
       src_row += src_stride/sizeof(*src_row);
       dst_row += dst_stride/sizeof(*dst_row);
@@ -269,7 +345,7 @@ util_format_z32_float_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride,
       const uint32_t *src = src_row;
       float *dst = (float *)dst_row;
       for(x = 0; x < width; ++x) {
-         *dst++ = (float)(*src++ * (1.0/0xffffffff));
+         *dst++ = z32_unorm_to_z32_float(*src++);
       }
       dst_row += dst_stride/sizeof(*dst_row);
       src_row += src_stride/sizeof(*src_row);
@@ -287,12 +363,10 @@ util_format_z24_unorm_s8_uscaled_unpack_z_float(float *dst_row, unsigned dst_str
       const uint32_t *src = (const uint32_t *)src_row;
       for(x = 0; x < width; ++x) {
          uint32_t value = *src++;
-         uint32_t z;
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
-         z = (value) & 0xffffff;
-         *dst++ = (float)(z * (1.0/0xffffff));
+         *dst++ = z24_unorm_to_z32_float(value & 0xffffff);
       }
       src_row += src_stride/sizeof(*src_row);
       dst_row += dst_stride/sizeof(*dst_row);
@@ -314,7 +388,7 @@ util_format_z24_unorm_s8_uscaled_pack_z_float(uint8_t *dst_row, unsigned dst_str
          value = util_bswap32(value);
 #endif
          value &= 0xff000000;
-         value |= ((uint32_t)(*src++ * (double)0xffffff)) & 0xffffff;
+         value |= z32_float_to_z24_unorm(*src++);
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
@@ -336,12 +410,10 @@ util_format_z24_unorm_s8_uscaled_unpack_z_32unorm(uint32_t *dst_row, unsigned ds
       const uint32_t *src = (const uint32_t *)src_row;
       for(x = 0; x < width; ++x) {
          uint32_t value = *src++;
-         uint32_t z;
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
-         z = value & 0xffffff;
-         *dst++ = (z << 8) | (z >> 16); /* z * 0xffffffff / 0xffffff */;
+         *dst++ = z24_unorm_to_z32_unorm(value & 0xffffff);
       }
       src_row += src_stride/sizeof(*src_row);
       dst_row += dst_stride/sizeof(*dst_row);
@@ -358,15 +430,16 @@ util_format_z24_unorm_s8_uscaled_pack_z_32unorm(uint8_t *dst_row, unsigned dst_s
       const uint32_t *src = src_row;
       uint32_t *dst = (uint32_t *)dst_row;
       for(x = 0; x < width; ++x) {
-         uint32_t value;
-         value = ((uint32_t)(*src >> 8)) & 0xffffff;
-         value = ((uint32_t)(((uint64_t)src[1]) * 0x1 / 0xffffffff)) << 24;
+         uint32_t value= *dst;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         value &= 0xff000000;
+         value |= z32_unorm_to_z24_unorm(*src++);
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
          *dst++ = value;
-         src += 1;
-         dst += 4;
       }
       dst_row += dst_stride/sizeof(*dst_row);
       src_row += src_stride/sizeof(*src_row);
@@ -381,19 +454,13 @@ util_format_z24_unorm_s8_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned ds
    unsigned x, y;
    for(y = 0; y < height; ++y) {
       uint8_t *dst = dst_row;
-      const uint8_t *src = src_row;
+      const uint32_t *src = (const uint32_t *)src_row;
       for(x = 0; x < width; ++x) {
-         uint32_t value = *(const uint32_t *)src;
-         uint32_t z;
-         uint32_t s;
+         uint32_t value = *src++;
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
-         z = (value) & 0xffffff;
-         s = value >> 24;
-         dst[1] = s;
-         src += 4;
-         dst += 1;
+         *dst++ = value >> 24;
       }
       src_row += src_stride/sizeof(*src_row);
       dst_row += dst_stride/sizeof(*dst_row);
@@ -408,17 +475,18 @@ util_format_z24_unorm_s8_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_
    unsigned x, y;
    for(y = 0; y < height; ++y) {
       const uint8_t *src = src_row;
-      uint8_t *dst = dst_row;
+      uint32_t *dst = (uint32_t *)dst_row;
       for(x = 0; x < width; ++x) {
-         uint32_t value;
-         value = ((uint32_t)(((uint32_t)MIN2(*src, 1)) * 0xffffff / 0x1)) & 0xffffff;
-         value = (src[1]) << 24;
+         uint32_t value = *dst;
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
-         *(uint32_t *)dst = value;
-         src += 1;
-         dst += 4;
+         value &= 0x00ffffff;
+         value |= *src++ << 24;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = value;
       }
       dst_row += dst_stride/sizeof(*dst_row);
       src_row += src_stride/sizeof(*src_row);
@@ -433,19 +501,13 @@ util_format_s8_uscaled_z24_unorm_unpack_z_float(float *dst_row, unsigned dst_str
    unsigned x, y;
    for(y = 0; y < height; ++y) {
       float *dst = dst_row;
-      const uint8_t *src = src_row;
+      const uint32_t *src = (const uint32_t *)src_row;
       for(x = 0; x < width; ++x) {
-         uint32_t value = *(const uint32_t *)src;
-         uint32_t s;
-         uint32_t z;
+         uint32_t value = *src++;
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
-         s = (value) & 0xff;
-         z = value >> 8;
-         dst[0] = (float)(z * (1.0/0xffffff));
-         src += 4;
-         dst += 1;
+         *dst++ = z24_unorm_to_z32_float(value >> 8);
       }
       src_row += src_stride/sizeof(*src_row);
       dst_row += dst_stride/sizeof(*dst_row);
@@ -460,17 +522,18 @@ util_format_s8_uscaled_z24_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_str
    unsigned x, y;
    for(y = 0; y < height; ++y) {
       const float *src = src_row;
-      uint8_t *dst = dst_row;
+      uint32_t *dst = (uint32_t *)dst_row;
       for(x = 0; x < width; ++x) {
-         uint32_t value;
-         value = ((uint32_t)CLAMP(src[1], 0, 255)) & 0xff;
-         value = ((uint32_t)(*src * (double)0xffffff)) << 8;
+         uint32_t value = *dst;
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
-         *(uint32_t *)dst = value;
-         src += 1;
-         dst += 4;
+         value &= 0x000000ff;
+         value |= z32_float_to_z24_unorm(*src++) << 8;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = value;
       }
       dst_row += dst_stride/sizeof(*dst_row);
       src_row += src_stride/sizeof(*src_row);
@@ -485,19 +548,13 @@ util_format_s8_uscaled_z24_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned ds
    unsigned x, y;
    for(y = 0; y < height; ++y) {
       uint32_t *dst = dst_row;
-      const uint8_t *src = src_row;
+      const uint32_t *src = (const uint32_t *)src_row;
       for(x = 0; x < width; ++x) {
-         uint32_t value = *(const uint32_t *)src;
-         uint32_t s;
-         uint32_t z;
+         uint32_t value = *src++;
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
-         s = (value) & 0xff;
-         z = value >> 8;
-         dst[0] = (uint32_t)(((uint64_t)z) * 0xffffffff / 0xffffff);
-         src += 4;
-         dst += 1;
+         *dst++ = z24_unorm_to_z32_unorm(value >> 8);
       }
       src_row += src_stride/sizeof(*src_row);
       dst_row += dst_stride/sizeof(*dst_row);
@@ -512,17 +569,18 @@ util_format_s8_uscaled_z24_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_s
    unsigned x, y;
    for(y = 0; y < height; ++y) {
       const uint32_t *src = src_row;
-      uint8_t *dst = dst_row;
+      uint32_t *dst = (uint32_t *)dst_row;
       for(x = 0; x < width; ++x) {
-         uint32_t value;
-         value = ((uint32_t)(((uint64_t)src[1]) * 0x1 / 0xffffffff)) & 0xff;
-         value = ((uint32_t)(*src >> 8)) << 8;
+         uint32_t value = *dst;
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
-         *(uint32_t *)dst = value;
-         src += 1;
-         dst += 4;
+         value &= 0x000000ff;
+         value |= *src++ & 0xffffff00;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = value;
       }
       dst_row += dst_stride/sizeof(*dst_row);
       src_row += src_stride/sizeof(*src_row);
@@ -537,19 +595,13 @@ util_format_s8_uscaled_z24_unorm_unpack_s_8uscaled(uint8_t *dst_row, unsigned ds
    unsigned x, y;
    for(y = 0; y < height; ++y) {
       uint8_t *dst = dst_row;
-      const uint8_t *src = src_row;
+      const uint32_t *src = (const uint32_t *)src_row;
       for(x = 0; x < width; ++x) {
-         uint32_t value = *(const uint32_t *)src;
-         uint32_t s;
-         uint32_t z;
+         uint32_t value = *src++;
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
-         s = (value) & 0xff;
-         z = value >> 8;
-         dst[1] = s;
-         src += 4;
-         dst += 1;
+         *dst++ = value & 0xff;
       }
       src_row += src_stride/sizeof(*src_row);
       dst_row += dst_stride/sizeof(*dst_row);
@@ -564,17 +616,18 @@ util_format_s8_uscaled_z24_unorm_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_
    unsigned x, y;
    for(y = 0; y < height; ++y) {
       const uint8_t *src = src_row;
-      uint8_t *dst = dst_row;
+      uint32_t *dst = (uint32_t *)dst_row;
       for(x = 0; x < width; ++x) {
-         uint32_t value;
-         value = (src[1]) & 0xff;
-         value = ((uint32_t)(((uint32_t)MIN2(*src, 1)) * 0xffffff / 0x1)) << 8;
+         uint32_t value = *dst;
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
-         *(uint32_t *)dst = value;
-         src += 1;
-         dst += 4;
+         value &= 0xffffff00;
+         value |= *src++;
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         value = util_bswap32(value);
+#endif
+         *dst++ = value;
       }
       dst_row += dst_stride/sizeof(*dst_row);
       src_row += src_stride/sizeof(*src_row);
@@ -589,17 +642,13 @@ util_format_z24x8_unorm_unpack_z_float(float *dst_row, unsigned dst_stride,
    unsigned x, y;
    for(y = 0; y < height; ++y) {
       float *dst = dst_row;
-      const uint8_t *src = src_row;
+      const uint32_t *src = (const uint32_t *)src_row;
       for(x = 0; x < width; ++x) {
-         uint32_t value = *(const uint32_t *)src;
-         uint32_t z;
+         uint32_t value = *src++;
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
-         z = (value) & 0xffffff;
-         dst[0] = (float)(z * (1.0/0xffffff));
-         src += 4;
-         dst += 1;
+         *dst++ = z24_unorm_to_z32_float(value & 0xffffff);
       }
       src_row += src_stride/sizeof(*src_row);
       dst_row += dst_stride/sizeof(*dst_row);
@@ -614,16 +663,14 @@ util_format_z24x8_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride,
    unsigned x, y;
    for(y = 0; y < height; ++y) {
       const float *src = src_row;
-      uint8_t *dst = dst_row;
+      uint32_t *dst = (uint32_t *)dst_row;
       for(x = 0; x < width; ++x) {
          uint32_t value;
-         value = ((uint32_t)(*src * (double)0xffffff)) & 0xffffff;
+         value = z32_float_to_z24_unorm(*src++);
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
-         *(uint32_t *)dst = value;
-         src += 1;
-         dst += 4;
+         *dst++ = value;
       }
       dst_row += dst_stride/sizeof(*dst_row);
       src_row += src_stride/sizeof(*src_row);
@@ -638,17 +685,13 @@ util_format_z24x8_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride,
    unsigned x, y;
    for(y = 0; y < height; ++y) {
       uint32_t *dst = dst_row;
-      const uint8_t *src = src_row;
+      const uint32_t *src = (const uint32_t *)src_row;
       for(x = 0; x < width; ++x) {
-         uint32_t value = *(const uint32_t *)src;
-         uint32_t z;
+         uint32_t value = *src++;
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
-         z = (value) & 0xffffff;
-         dst[0] = (uint32_t)(((uint64_t)z) * 0xffffffff / 0xffffff);
-         src += 4;
-         dst += 1;
+         *dst++ = z24_unorm_to_z32_unorm(value & 0xffffff);
       }
       src_row += src_stride/sizeof(*src_row);
       dst_row += dst_stride/sizeof(*dst_row);
@@ -663,16 +706,14 @@ util_format_z24x8_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride,
    unsigned x, y;
    for(y = 0; y < height; ++y) {
       const uint32_t *src = src_row;
-      uint8_t *dst = dst_row;
+      uint32_t *dst = (uint32_t *)dst_row;
       for(x = 0; x < width; ++x) {
          uint32_t value;
-         value = ((uint32_t)(*src >> 8)) & 0xffffff;
+         value = z32_unorm_to_z24_unorm(*src++);
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
-         *(uint32_t *)dst = value;
-         src += 1;
-         dst += 4;
+         *dst++ = value;
       }
       dst_row += dst_stride/sizeof(*dst_row);
       src_row += src_stride/sizeof(*src_row);
@@ -687,17 +728,13 @@ util_format_x8z24_unorm_unpack_z_float(float *dst_row, unsigned dst_stride,
    unsigned x, y;
    for(y = 0; y < height; ++y) {
       float *dst = dst_row;
-      const uint8_t *src = src_row;
+      const uint32_t *src = (const uint32_t *)src_row;
       for(x = 0; x < width; ++x) {
-         uint32_t value = *(const uint32_t *)src;
-         uint32_t z;
+         uint32_t value = *src++;
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
-         z = value >> 8;
-         dst[0] = (float)(z * (1.0/0xffffff));
-         src += 4;
-         dst += 1;
+         *dst++ = z24_unorm_to_z32_float(value >> 8);
       }
       src_row += src_stride/sizeof(*src_row);
       dst_row += dst_stride/sizeof(*dst_row);
@@ -712,16 +749,14 @@ util_format_x8z24_unorm_pack_z_float(uint8_t *dst_row, unsigned dst_stride,
    unsigned x, y;
    for(y = 0; y < height; ++y) {
       const float *src = src_row;
-      uint8_t *dst = dst_row;
+      uint32_t *dst = (uint32_t *)dst_row;
       for(x = 0; x < width; ++x) {
          uint32_t value;
-         value = ((uint32_t)(*src * (double)0xffffff)) << 8;
+         value = z32_float_to_z24_unorm(*src++) << 8;
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
-         *(uint32_t *)dst = value;
-         src += 1;
-         dst += 4;
+         *dst++ = value;
       }
       dst_row += dst_stride/sizeof(*dst_row);
       src_row += src_stride/sizeof(*src_row);
@@ -736,17 +771,13 @@ util_format_x8z24_unorm_unpack_z_32unorm(uint32_t *dst_row, unsigned dst_stride,
    unsigned x, y;
    for(y = 0; y < height; ++y) {
       uint32_t *dst = dst_row;
-      const uint8_t *src = src_row;
+      const uint32_t *src = (const uint32_t *)src_row;
       for(x = 0; x < width; ++x) {
-         uint32_t value = *(const uint32_t *)src;
-         uint32_t z;
+         uint32_t value = *src++;
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
-         z = value >> 8;
-         dst[0] = (uint32_t)(((uint64_t)z) * 0xffffffff / 0xffffff);
-         src += 4;
-         dst += 1;
+         *dst++ = z24_unorm_to_z32_unorm(value >> 8);
       }
       src_row += src_stride/sizeof(*src_row);
       dst_row += dst_stride/sizeof(*dst_row);
@@ -761,16 +792,14 @@ util_format_x8z24_unorm_pack_z_32unorm(uint8_t *dst_row, unsigned dst_stride,
    unsigned x, y;
    for(y = 0; y < height; ++y) {
       const uint32_t *src = src_row;
-      uint8_t *dst = dst_row;
+      uint32_t *dst = (uint32_t *)dst_row;
       for(x = 0; x < width; ++x) {
          uint32_t value;
-         value = ((uint32_t)(*src >> 8)) << 8;
+         value = z32_unorm_to_z24_unorm(*src++) << 8;
 #ifdef PIPE_ARCH_BIG_ENDIAN
          value = util_bswap32(value);
 #endif
-         *(uint32_t *)dst = value;
-         src += 1;
-         dst += 4;
+         *dst++ = value;
       }
       dst_row += dst_stride/sizeof(*dst_row);
       src_row += src_stride/sizeof(*src_row);
@@ -825,7 +854,7 @@ util_format_z32_float_s8x24_uscaled_unpack_z_32unorm(uint32_t *dst_row, unsigned
       uint32_t *dst = dst_row;
       const float *src = (const float *)src_row;
       for(x = 0; x < width; ++x) {
-         *dst = (uint32_t)(*src * (double)0xffffffff);
+         *dst = z32_float_to_z32_unorm(*src);
          src += 2;
          dst += 1;
       }
@@ -844,9 +873,12 @@ util_format_z32_float_s8x24_uscaled_pack_z_32unorm(uint8_t *dst_row, unsigned ds
       const uint32_t *src = src_row;
       float *dst = (float *)dst_row;
       for(x = 0; x < width; ++x) {
-         *dst = (float)(*src * (1.0/0xffffffff));
-         src += 2;
-         dst += 1;
+         /* One uint32 depth in per pixel; each packed pixel spans two
+          * 32-bit words (the unpack path reads with a stride of 2), so
+          * step dst over the word that follows the float depth. */
+         *dst = z32_unorm_to_z32_float(*src);
+         src += 1;
+         dst += 2;
       }
       dst_row += dst_stride/sizeof(*dst_row);
       src_row += src_stride/sizeof(*src_row);