target.c (struct gomp_coalesce_buf): New type.
authorJakub Jelinek <jakub@redhat.com>
Sat, 28 Oct 2017 07:02:39 +0000 (09:02 +0200)
committerJakub Jelinek <jakub@gcc.gnu.org>
Sat, 28 Oct 2017 07:02:39 +0000 (09:02 +0200)
* target.c (struct gomp_coalesce_buf): New type.
(MAX_COALESCE_BUF_SIZE, MAX_COALESCE_BUF_GAP): Define.
(gomp_coalesce_buf_add, gomp_to_device_kind_p): New functions.
(gomp_copy_host2dev): Add CBUF argument, if copying into
the cached ranges, memcpy into buffer instead of copying
into device.
(gomp_map_vars_existing, gomp_map_pointer, gomp_map_fields_existing):
Add CBUF argument, pass it through to other calls.
(gomp_map_vars): Aggregate copies from host to device if small enough
and with small enough gaps in between into memcpy into a buffer and
fewer host to device copies from the buffer.
(gomp_update): Adjust gomp_copy_host2dev caller.

From-SVN: r254194

libgomp/ChangeLog
libgomp/target.c

index 35a2374d0b336a13b5846d7ee6815b5d51b14761..39e98c735c7f33498cc372d680a601dc12f1996e 100644 (file)
@@ -1,3 +1,18 @@
+2017-10-28  Jakub Jelinek  <jakub@redhat.com>
+
+       * target.c (struct gomp_coalesce_buf): New type.
+       (MAX_COALESCE_BUF_SIZE, MAX_COALESCE_BUF_GAP): Define.
+       (gomp_coalesce_buf_add, gomp_to_device_kind_p): New functions.
+       (gomp_copy_host2dev): Add CBUF argument, if copying into
+       the cached ranges, memcpy into buffer instead of copying
+       into device.
+       (gomp_map_vars_existing, gomp_map_pointer, gomp_map_fields_existing):
+       Add CBUF argument, pass it through to other calls.
+       (gomp_map_vars): Aggregate copies from host to device if small enough
+       and with small enough gaps in between into memcpy into a buffer and
+       fewer host to device copies from the buffer.
+       (gomp_update): Adjust gomp_copy_host2dev caller.
+
 2017-10-17  Thomas Schwinge  <thomas@codesourcery.com>
 
        * testsuite/libgomp.oacc-fortran/declare-1.f90: Restore "dg-do
index 3dd119f52e5d77ec8110f1b7b9b689caeb96c489..8ac05e8c6415c692a3ed55d13877cb436318d2e5 100644 (file)
@@ -177,10 +177,122 @@ gomp_device_copy (struct gomp_device_descr *devicep,
     }
 }
 
+/* Infrastructure for coalescing adjacent or nearly adjacent (in device addresses)
+   host to device memory transfers.  */
+
+struct gomp_coalesce_buf
+{
+  /* Buffer into which gomp_copy_host2dev will memcpy data and from which
+     it will be copied to the device.  */
+  void *buf;
+  struct target_mem_desc *tgt;
+  /* Array with offsets, chunks[2 * i] is the starting offset and
+     chunks[2 * i + 1] ending offset relative to tgt->tgt_start device address
+     of chunks which are to be copied to buf and later copied to device.  */
+  size_t *chunks;
+  /* Number of chunks in chunks array, or -1 if coalesce buffering should not
+     be performed.  */
+  long chunk_cnt;
+  /* During construction of chunks array, how many memory regions are within
+     the last chunk.  If there is just one memory region for a chunk, we copy
+     it directly to device rather than going through buf.  */
+  long use_cnt;
+};
+
+/* Maximum size of memory region considered for coalescing.  Larger copies
+   are performed directly.  */
+#define MAX_COALESCE_BUF_SIZE  (32 * 1024)
+
+/* Maximum size of a gap in between regions to consider them being copied
+   within the same chunk.  All the device offsets considered are within
+   newly allocated device memory, so it isn't fatal if we copy some padding
+   in between from host to device.  The gaps come either from alignment
+   padding or from memory regions which are not supposed to be copied from
+   host to device (e.g. map(alloc:), map(from:) etc.).  */
+#define MAX_COALESCE_BUF_GAP   (4 * 1024)
+
+/* Add region with device tgt_start relative offset and length to CBUF.  */
+
+static inline void
+gomp_coalesce_buf_add (struct gomp_coalesce_buf *cbuf, size_t start, size_t len)
+{
+  if (len > MAX_COALESCE_BUF_SIZE || len == 0)
+    return;
+  if (cbuf->chunk_cnt)
+    {
+      if (cbuf->chunk_cnt < 0)
+       return;
+      if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1])
+       {
+         cbuf->chunk_cnt = -1;
+         return;
+       }
+      if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1] + MAX_COALESCE_BUF_GAP)
+       {
+         cbuf->chunks[2 * cbuf->chunk_cnt - 1] = start + len;
+         cbuf->use_cnt++;
+         return;
+       }
+      /* If the last chunk is only used by one mapping, discard it,
+        as it will be one host to device copy anyway and
+        memcpying it around will only waste cycles.  */
+      if (cbuf->use_cnt == 1)
+       cbuf->chunk_cnt--;
+    }
+  cbuf->chunks[2 * cbuf->chunk_cnt] = start;
+  cbuf->chunks[2 * cbuf->chunk_cnt + 1] = start + len;
+  cbuf->chunk_cnt++;
+  cbuf->use_cnt = 1;
+}
+
+/* Return true for mapping kinds which need to copy data from the
+   host to device for regions that weren't previously mapped.  */
+
+static inline bool
+gomp_to_device_kind_p (int kind)
+{
+  switch (kind)
+    {
+    case GOMP_MAP_ALLOC:
+    case GOMP_MAP_FROM:
+    case GOMP_MAP_FORCE_ALLOC:
+    case GOMP_MAP_ALWAYS_FROM:
+      return false;
+    default:
+      return true;
+    }
+}
+
 static void
 gomp_copy_host2dev (struct gomp_device_descr *devicep,
-                   void *d, const void *h, size_t sz)
+                   void *d, const void *h, size_t sz,
+                   struct gomp_coalesce_buf *cbuf)
 {
+  if (cbuf)
+    {
+      uintptr_t doff = (uintptr_t) d - cbuf->tgt->tgt_start;
+      if (doff < cbuf->chunks[2 * cbuf->chunk_cnt - 1])
+       {
+         long first = 0;
+         long last = cbuf->chunk_cnt - 1;
+         while (first <= last)
+           {
+             long middle = (first + last) >> 1;
+             if (cbuf->chunks[2 * middle + 1] <= doff)
+               first = middle + 1;
+             else if (cbuf->chunks[2 * middle] <= doff)
+               {
+                 if (doff + sz > cbuf->chunks[2 * middle + 1])
+                   gomp_fatal ("internal libgomp cbuf error");
+                 memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0]),
+                         h, sz);
+                 return;
+               }
+             else
+               last = middle - 1;
+           }
+       }
+    }
   gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
 }
 
@@ -208,7 +320,7 @@ gomp_free_device_memory (struct gomp_device_descr *devicep, void *devptr)
 static inline void
 gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn,
                        splay_tree_key newn, struct target_var_desc *tgt_var,
-                       unsigned char kind)
+                       unsigned char kind, struct gomp_coalesce_buf *cbuf)
 {
   tgt_var->key = oldn;
   tgt_var->copy_from = GOMP_MAP_COPY_FROM_P (kind);
@@ -232,7 +344,7 @@ gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn,
                        (void *) (oldn->tgt->tgt_start + oldn->tgt_offset
                                  + newn->host_start - oldn->host_start),
                        (void *) newn->host_start,
-                       newn->host_end - newn->host_start);
+                       newn->host_end - newn->host_start, cbuf);
 
   if (oldn->refcount != REFCOUNT_INFINITY)
     oldn->refcount++;
@@ -247,7 +359,8 @@ get_kind (bool short_mapkind, void *kinds, int idx)
 
 static void
 gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
-                 uintptr_t target_offset, uintptr_t bias)
+                 uintptr_t target_offset, uintptr_t bias,
+                 struct gomp_coalesce_buf *cbuf)
 {
   struct gomp_device_descr *devicep = tgt->device_descr;
   struct splay_tree_s *mem_map = &devicep->mem_map;
@@ -257,11 +370,10 @@ gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
   if (cur_node.host_start == (uintptr_t) NULL)
     {
       cur_node.tgt_offset = (uintptr_t) NULL;
-      /* FIXME: see comment about coalescing host/dev transfers below.  */
       gomp_copy_host2dev (devicep,
                          (void *) (tgt->tgt_start + target_offset),
                          (void *) &cur_node.tgt_offset,
-                         sizeof (void *));
+                         sizeof (void *), cbuf);
       return;
     }
   /* Add bias to the pointer value.  */
@@ -280,15 +392,15 @@ gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
      array section.  Now subtract bias to get what we want
      to initialize the pointer with.  */
   cur_node.tgt_offset -= bias;
-  /* FIXME: see comment about coalescing host/dev transfers below.  */
   gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + target_offset),
-                     (void *) &cur_node.tgt_offset, sizeof (void *));
+                     (void *) &cur_node.tgt_offset, sizeof (void *), cbuf);
 }
 
 static void
 gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
                          size_t first, size_t i, void **hostaddrs,
-                         size_t *sizes, void *kinds)
+                         size_t *sizes, void *kinds,
+                         struct gomp_coalesce_buf *cbuf)
 {
   struct gomp_device_descr *devicep = tgt->device_descr;
   struct splay_tree_s *mem_map = &devicep->mem_map;
@@ -306,7 +418,7 @@ gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
       && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
     {
       gomp_map_vars_existing (devicep, n2, &cur_node,
-                             &tgt->list[i], kind & typemask);
+                             &tgt->list[i], kind & typemask, cbuf);
       return;
     }
   if (sizes[i] == 0)
@@ -322,7 +434,7 @@ gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
                 == n2->tgt_offset - n->tgt_offset)
            {
              gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
-                                     kind & typemask);
+                                     kind & typemask, cbuf);
              return;
            }
        }
@@ -334,7 +446,7 @@ gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
          && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
        {
          gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
-                                 kind & typemask);
+                                 kind & typemask, cbuf);
          return;
        }
     }
@@ -381,6 +493,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
   tgt->list_count = mapnum;
   tgt->refcount = pragma_kind == GOMP_MAP_VARS_ENTER_DATA ? 0 : 1;
   tgt->device_descr = devicep;
+  struct gomp_coalesce_buf cbuf, *cbufp = NULL;
 
   if (mapnum == 0)
     {
@@ -391,11 +504,25 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
 
   tgt_align = sizeof (void *);
   tgt_size = 0;
+  cbuf.chunks = NULL;
+  cbuf.chunk_cnt = -1;
+  cbuf.use_cnt = 0;
+  cbuf.buf = NULL;
+  if (mapnum > 1 || pragma_kind == GOMP_MAP_VARS_TARGET)
+    {
+      cbuf.chunks
+       = (size_t *) gomp_alloca ((2 * mapnum + 2) * sizeof (size_t));
+      cbuf.chunk_cnt = 0;
+    }
   if (pragma_kind == GOMP_MAP_VARS_TARGET)
     {
       size_t align = 4 * sizeof (void *);
       tgt_align = align;
       tgt_size = mapnum * sizeof (void *);
+      cbuf.chunk_cnt = 1;
+      cbuf.use_cnt = 1 + (mapnum > 1);
+      cbuf.chunks[0] = 0;
+      cbuf.chunks[1] = tgt_size;
     }
 
   gomp_mutex_lock (&devicep->lock);
@@ -449,19 +576,26 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
              size_t align = (size_t) 1 << (kind >> rshift);
              if (tgt_align < align)
                tgt_align = align;
-             tgt_size -= (uintptr_t) hostaddrs[first]
-                         - (uintptr_t) hostaddrs[i];
+             tgt_size -= (uintptr_t) hostaddrs[first] - cur_node.host_start;
              tgt_size = (tgt_size + align - 1) & ~(align - 1);
-             tgt_size += cur_node.host_end - (uintptr_t) hostaddrs[i];
+             tgt_size += cur_node.host_end - cur_node.host_start;
              not_found_cnt += last - i;
              for (i = first; i <= last; i++)
-               tgt->list[i].key = NULL;
+               {
+                 tgt->list[i].key = NULL;
+                 if (gomp_to_device_kind_p (get_kind (short_mapkind, kinds, i)
+                                            & typemask))
+                   gomp_coalesce_buf_add (&cbuf,
+                                          tgt_size - cur_node.host_end
+                                          + (uintptr_t) hostaddrs[i],
+                                          sizes[i]);
+               }
              i--;
              continue;
            }
          for (i = first; i <= last; i++)
            gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
-                                     sizes, kinds);
+                                     sizes, kinds, NULL);
          i--;
          continue;
        }
@@ -485,6 +619,8 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
          if (tgt_align < align)
            tgt_align = align;
          tgt_size = (tgt_size + align - 1) & ~(align - 1);
+         gomp_coalesce_buf_add (&cbuf, tgt_size,
+                                cur_node.host_end - cur_node.host_start);
          tgt_size += cur_node.host_end - cur_node.host_start;
          has_firstprivate = true;
          continue;
@@ -504,7 +640,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
        n = splay_tree_lookup (mem_map, &cur_node);
       if (n && n->refcount != REFCOUNT_LINK)
        gomp_map_vars_existing (devicep, n, &cur_node, &tgt->list[i],
-                               kind & typemask);
+                               kind & typemask, NULL);
       else
        {
          tgt->list[i].key = NULL;
@@ -514,6 +650,9 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
          if (tgt_align < align)
            tgt_align = align;
          tgt_size = (tgt_size + align - 1) & ~(align - 1);
+         if (gomp_to_device_kind_p (kind & typemask))
+           gomp_coalesce_buf_add (&cbuf, tgt_size,
+                                  cur_node.host_end - cur_node.host_start);
          tgt_size += cur_node.host_end - cur_node.host_start;
          if ((kind & typemask) == GOMP_MAP_TO_PSET)
            {
@@ -562,6 +701,19 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
       tgt->tgt_start = (uintptr_t) tgt->to_free;
       tgt->tgt_start = (tgt->tgt_start + tgt_align - 1) & ~(tgt_align - 1);
       tgt->tgt_end = tgt->tgt_start + tgt_size;
+
+      if (cbuf.use_cnt == 1)
+       cbuf.chunk_cnt--;
+      if (cbuf.chunk_cnt > 0)
+       {
+         cbuf.buf
+           = malloc (cbuf.chunks[2 * cbuf.chunk_cnt - 1] - cbuf.chunks[0]);
+         if (cbuf.buf)
+           {
+             cbuf.tgt = tgt;
+             cbufp = &cbuf;
+           }
+       }
     }
   else
     {
@@ -600,7 +752,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
                len = sizes[i];
                gomp_copy_host2dev (devicep,
                                    (void *) (tgt->tgt_start + tgt_size),
-                                   (void *) hostaddrs[i], len);
+                                   (void *) hostaddrs[i], len, cbufp);
                tgt_size += len;
                continue;
              case GOMP_MAP_FIRSTPRIVATE_INT:
@@ -633,7 +785,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
                  }
                for (i = first; i <= last; i++)
                  gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
-                                           sizes, kinds);
+                                           sizes, kinds, cbufp);
                i--;
                continue;
              case GOMP_MAP_ALWAYS_POINTER:
@@ -658,7 +810,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
                                              + cur_node.host_start
                                              - n->host_start),
                                    (void *) &cur_node.tgt_offset,
-                                   sizeof (void *));
+                                   sizeof (void *), cbufp);
                cur_node.tgt_offset = n->tgt->tgt_start + n->tgt_offset
                                      + cur_node.host_start - n->host_start;
                continue;
@@ -674,7 +826,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
            splay_tree_key n = splay_tree_lookup (mem_map, k);
            if (n && n->refcount != REFCOUNT_LINK)
              gomp_map_vars_existing (devicep, n, k, &tgt->list[i],
-                                     kind & typemask);
+                                     kind & typemask, cbufp);
            else
              {
                k->link_key = NULL;
@@ -725,26 +877,22 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
                  case GOMP_MAP_FORCE_TOFROM:
                  case GOMP_MAP_ALWAYS_TO:
                  case GOMP_MAP_ALWAYS_TOFROM:
-                   /* FIXME: Perhaps add some smarts, like if copying
-                      several adjacent fields from host to target, use some
-                      host buffer to avoid sending each var individually.  */
                    gomp_copy_host2dev (devicep,
                                        (void *) (tgt->tgt_start
                                                  + k->tgt_offset),
                                        (void *) k->host_start,
-                                       k->host_end - k->host_start);
+                                       k->host_end - k->host_start, cbufp);
                    break;
                  case GOMP_MAP_POINTER:
                    gomp_map_pointer (tgt, (uintptr_t) *(void **) k->host_start,
-                                     k->tgt_offset, sizes[i]);
+                                     k->tgt_offset, sizes[i], cbufp);
                    break;
                  case GOMP_MAP_TO_PSET:
-                   /* FIXME: see above FIXME comment.  */
                    gomp_copy_host2dev (devicep,
                                        (void *) (tgt->tgt_start
                                                  + k->tgt_offset),
                                        (void *) k->host_start,
-                                       k->host_end - k->host_start);
+                                       k->host_end - k->host_start, cbufp);
 
                    for (j = i + 1; j < mapnum; j++)
                      if (!GOMP_MAP_POINTER_P (get_kind (short_mapkind, kinds,
@@ -767,7 +915,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
                                            k->tgt_offset
                                            + ((uintptr_t) hostaddrs[j]
                                               - k->host_start),
-                                           sizes[j]);
+                                           sizes[j], cbufp);
                          i++;
                        }
                    break;
@@ -795,7 +943,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
                                        (void *) (tgt->tgt_start
                                                  + k->tgt_offset),
                                        (void *) k->host_start,
-                                       sizeof (void *));
+                                       sizeof (void *), cbufp);
                    break;
                  default:
                    gomp_mutex_unlock (&devicep->lock);
@@ -822,13 +970,23 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
       for (i = 0; i < mapnum; i++)
        {
          cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
-         /* FIXME: see above FIXME comment.  */
          gomp_copy_host2dev (devicep,
                              (void *) (tgt->tgt_start + i * sizeof (void *)),
-                             (void *) &cur_node.tgt_offset, sizeof (void *));
+                             (void *) &cur_node.tgt_offset, sizeof (void *),
+                             cbufp);
        }
     }
 
+  if (cbufp)
+    {
+      long c = 0;
+      for (c = 0; c < cbuf.chunk_cnt; ++c)
+       gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + cbuf.chunks[2 * c]),
+                           (char *) cbuf.buf + (cbuf.chunks[2 * c] - cbuf.chunks[0]),
+                           cbuf.chunks[2 * c + 1] - cbuf.chunks[2 * c], NULL);
+      free (cbuf.buf);
+    }
+
   /* If the variable from "omp target enter data" map-list was already mapped,
      tgt is not needed.  Otherwise tgt will be freed by gomp_unmap_vars or
      gomp_exit_data.  */
@@ -970,7 +1128,7 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
            size_t size = cur_node.host_end - cur_node.host_start;
 
            if (GOMP_MAP_COPY_TO_P (kind & typemask))
-             gomp_copy_host2dev (devicep, devaddr, hostaddr, size);
+             gomp_copy_host2dev (devicep, devaddr, hostaddr, size, NULL);
            if (GOMP_MAP_COPY_FROM_P (kind & typemask))
              gomp_copy_dev2host (devicep, hostaddr, devaddr, size);
          }