libgomp/team.c
/* Copyright (C) 2005-2020 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file handles the maintenance of threads in response to team
   creation and termination.  */

#include "libgomp.h"
#include "pool.h"
#include <stdlib.h>
#include <string.h>

#ifdef LIBGOMP_USE_PTHREADS
pthread_attr_t gomp_thread_attr;

/* This key is for the thread destructor.  */
pthread_key_t gomp_thread_destructor;


/* This is the libgomp per-thread data structure.  */
#if defined HAVE_TLS || defined USE_EMUTLS
__thread struct gomp_thread gomp_tls_data;
#else
pthread_key_t gomp_tls_key;
#endif
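
/* A descriptive note: when the target lacks native TLS (the #else branch
   above), gomp_thread () in libgomp.h looks the per-thread data up through
   this pthread key instead of using gomp_tls_data.  */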


/* This structure is used to communicate across pthread_create.  */

struct gomp_thread_start_data
{
  void (*fn) (void *);
  void *fn_data;
  struct gomp_team_state ts;
  struct gomp_task *task;
  struct gomp_thread_pool *thread_pool;
  unsigned int place;
  bool nested;
  pthread_t handle;
};
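
/* A descriptive note: gomp_team_start carves these structures out of a
   single gomp_alloca'd array on its own stack; gomp_thread_start copies
   the fields it needs into its gomp_thread before the startup barrier is
   passed, so the array does not need to outlive gomp_team_start.  */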


/* This function is a pthread_create entry point.  This contains the idle
   loop in which a thread waits to be called up to become part of a team.  */

static void *
gomp_thread_start (void *xdata)
{
  struct gomp_thread_start_data *data = xdata;
  struct gomp_thread *thr;
  struct gomp_thread_pool *pool;
  void (*local_fn) (void *);
  void *local_data;

#if defined HAVE_TLS || defined USE_EMUTLS
  thr = &gomp_tls_data;
#else
  struct gomp_thread local_thr;
  thr = &local_thr;
  pthread_setspecific (gomp_tls_key, thr);
#endif
  gomp_sem_init (&thr->release, 0);

  /* Extract what we need from data.  */
  local_fn = data->fn;
  local_data = data->fn_data;
  thr->thread_pool = data->thread_pool;
  thr->ts = data->ts;
  thr->task = data->task;
  thr->place = data->place;
#ifdef GOMP_NEEDS_THREAD_HANDLE
  thr->handle = data->handle;
#endif

  thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release;

  /* Make thread pool local.  */
  pool = thr->thread_pool;

  if (data->nested)
    {
      struct gomp_team *team = thr->ts.team;
      struct gomp_task *task = thr->task;

      gomp_barrier_wait (&team->barrier);

      local_fn (local_data);
      gomp_team_barrier_wait_final (&team->barrier);
      gomp_finish_task (task);
      gomp_barrier_wait_last (&team->barrier);
    }
  else
    {
      pool->threads[thr->ts.team_id] = thr;

      gomp_simple_barrier_wait (&pool->threads_dock);
      do
        {
          struct gomp_team *team = thr->ts.team;
          struct gomp_task *task = thr->task;

          local_fn (local_data);
          gomp_team_barrier_wait_final (&team->barrier);
          gomp_finish_task (task);

          gomp_simple_barrier_wait (&pool->threads_dock);

          local_fn = thr->fn;
          local_data = thr->data;
          thr->fn = NULL;
        }
      while (local_fn);
    }

  gomp_sem_destroy (&thr->release);
  pthread_detach (pthread_self ());
  thr->thread_pool = NULL;
  thr->task = NULL;
  return NULL;
}
#endif

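/* If the calling thread is not yet part of a team, try to reuse the team
   most recently released by gomp_team_end (cached in pool->last_team),
   provided it was set up for the same number of threads.  Return NULL
   otherwise.  */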
static inline struct gomp_team *
get_last_team (unsigned nthreads)
{
  struct gomp_thread *thr = gomp_thread ();
  if (thr->ts.team == NULL)
    {
      struct gomp_thread_pool *pool = gomp_get_thread_pool (thr, nthreads);
      struct gomp_team *last_team = pool->last_team;
      if (last_team != NULL && last_team->nthreads == nthreads)
        {
          pool->last_team = NULL;
          return last_team;
        }
    }
  return NULL;
}

/* Create a new team data structure.  */

struct gomp_team *
gomp_new_team (unsigned nthreads)
{
  struct gomp_team *team;
  int i;

  team = get_last_team (nthreads);
  if (team == NULL)
    {
      size_t extra = sizeof (team->ordered_release[0])
                     + sizeof (team->implicit_task[0]);
      team = team_malloc (sizeof (*team) + nthreads * extra);

#ifndef HAVE_SYNC_BUILTINS
      gomp_mutex_init (&team->work_share_list_free_lock);
#endif
      gomp_barrier_init (&team->barrier, nthreads);
      gomp_mutex_init (&team->task_lock);

      team->nthreads = nthreads;
    }

  team->work_share_chunk = 8;
#ifdef HAVE_SYNC_BUILTINS
  team->single_count = 0;
#endif
  team->work_shares_to_free = &team->work_shares[0];
  gomp_init_work_share (&team->work_shares[0], 0, nthreads);
  team->work_shares[0].next_alloc = NULL;
  team->work_share_list_free = NULL;
  team->work_share_list_alloc = &team->work_shares[1];
  for (i = 1; i < 7; i++)
    team->work_shares[i].next_free = &team->work_shares[i + 1];
  team->work_shares[i].next_free = NULL;

  gomp_sem_init (&team->master_release, 0);
  team->ordered_release = (void *) &team->implicit_task[nthreads];
  team->ordered_release[0] = &team->master_release;

  priority_queue_init (&team->task_queue);
  team->task_count = 0;
  team->task_queued_count = 0;
  team->task_running_count = 0;
  team->work_share_cancelled = 0;
  team->team_cancelled = 0;

  return team;
}
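
/* For reference, a typical non-nested caller pairs the routines in this
   file roughly the way GOMP_parallel in parallel.c does (this is only an
   illustrative sketch; the real caller also resolves the number of
   threads and handles cancellation):

     struct gomp_team *team = gomp_new_team (num_threads);
     gomp_team_start (fn, data, num_threads, flags, team, NULL);
     fn (data);
     gomp_team_end ();  */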


/* Free a team data structure.  */

static void
free_team (struct gomp_team *team)
{
#ifndef HAVE_SYNC_BUILTINS
  gomp_mutex_destroy (&team->work_share_list_free_lock);
#endif
  gomp_barrier_destroy (&team->barrier);
  gomp_mutex_destroy (&team->task_lock);
  priority_queue_free (&team->task_queue);
  team_free (team);
}

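/* Tear-down helper run by each docked pool thread when the pool is being
   freed: arrive at the dock barrier so the master can make progress, then
   destroy this thread's own state and terminate the thread.  */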
static void
gomp_free_pool_helper (void *thread_pool)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_thread_pool *pool
    = (struct gomp_thread_pool *) thread_pool;
  gomp_simple_barrier_wait_last (&pool->threads_dock);
  gomp_sem_destroy (&thr->release);
  thr->thread_pool = NULL;
  thr->task = NULL;
#ifdef LIBGOMP_USE_PTHREADS
  pthread_detach (pthread_self ());
  pthread_exit (NULL);
#elif defined(__nvptx__)
  asm ("exit;");
#elif defined(__AMDGCN__)
  asm ("s_dcache_wb\n\t"
       "s_endpgm");
#else
#error gomp_free_pool_helper must terminate the thread
#endif
}

/* Free a thread pool and release its threads.  */
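/* This is also installed as the destructor for the gomp_thread_destructor
   pthread key (see initialize_team below), so it runs automatically at
   thread exit for any thread that registered itself with that key.  */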

void
gomp_free_thread (void *arg __attribute__((unused)))
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_thread_pool *pool = thr->thread_pool;
  if (pool)
    {
      if (pool->threads_used > 0)
        {
          int i;
          for (i = 1; i < pool->threads_used; i++)
            {
              struct gomp_thread *nthr = pool->threads[i];
              nthr->fn = gomp_free_pool_helper;
              nthr->data = pool;
            }
          /* This barrier undocks threads docked on pool->threads_dock.  */
          gomp_simple_barrier_wait (&pool->threads_dock);
          /* And this waits until all threads have called
             gomp_simple_barrier_wait_last in gomp_free_pool_helper.  */
          gomp_simple_barrier_wait (&pool->threads_dock);
          /* Now it is safe to destroy the barrier and free the pool.  */
          gomp_simple_barrier_destroy (&pool->threads_dock);

#ifdef HAVE_SYNC_BUILTINS
          __sync_fetch_and_add (&gomp_managed_threads,
                                1L - pool->threads_used);
#else
          gomp_mutex_lock (&gomp_managed_threads_lock);
          gomp_managed_threads -= pool->threads_used - 1L;
          gomp_mutex_unlock (&gomp_managed_threads_lock);
#endif
        }
      if (pool->last_team)
        free_team (pool->last_team);
#ifndef __nvptx__
      team_free (pool->threads);
      team_free (pool);
#endif
      thr->thread_pool = NULL;
    }
  if (thr->ts.level == 0 && __builtin_expect (thr->ts.team != NULL, 0))
    gomp_team_end ();
  if (thr->task != NULL)
    {
      struct gomp_task *task = thr->task;
      gomp_end_task ();
      free (task);
    }
}

/* Launch a team.  */
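/* The calling (master) thread becomes team member 0; idle threads docked
   on the pool's threads_dock barrier are reused for further members where
   possible, and new pthreads are created for the rest.  */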

#ifdef LIBGOMP_USE_PTHREADS
void
gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
                 unsigned flags, struct gomp_team *team,
                 struct gomp_taskgroup *taskgroup)
{
  struct gomp_thread_start_data *start_data;
  struct gomp_thread *thr, *nthr;
  struct gomp_task *task;
  struct gomp_task_icv *icv;
  bool nested;
  struct gomp_thread_pool *pool;
  unsigned i, n, old_threads_used = 0;
  pthread_attr_t thread_attr, *attr;
  unsigned long nthreads_var;
  char bind, bind_var;
  unsigned int s = 0, rest = 0, p = 0, k = 0;
  unsigned int affinity_count = 0;
  struct gomp_thread **affinity_thr = NULL;
  bool force_display = false;

  thr = gomp_thread ();
  nested = thr->ts.level;
  pool = thr->thread_pool;
  task = thr->task;
  icv = task ? &task->icv : &gomp_global_icv;
  if (__builtin_expect (gomp_places_list != NULL, 0) && thr->place == 0)
    {
      gomp_init_affinity ();
      if (__builtin_expect (gomp_display_affinity_var, 0) && nthreads == 1)
        gomp_display_affinity_thread (gomp_thread_self (), &thr->ts,
                                      thr->place);
    }

  /* Always save the previous state, even if this isn't a nested team.
     In particular, we should save any work share state from an outer
     orphaned work share construct.  */
  team->prev_ts = thr->ts;

  thr->ts.team = team;
  thr->ts.team_id = 0;
  ++thr->ts.level;
  if (nthreads > 1)
    ++thr->ts.active_level;
  thr->ts.work_share = &team->work_shares[0];
  thr->ts.last_work_share = NULL;
#ifdef HAVE_SYNC_BUILTINS
  thr->ts.single_count = 0;
#endif
  thr->ts.static_trip = 0;
  thr->task = &team->implicit_task[0];
#ifdef GOMP_NEEDS_THREAD_HANDLE
  thr->handle = pthread_self ();
#endif
  nthreads_var = icv->nthreads_var;
  if (__builtin_expect (gomp_nthreads_var_list != NULL, 0)
      && thr->ts.level < gomp_nthreads_var_list_len)
    nthreads_var = gomp_nthreads_var_list[thr->ts.level];
  bind_var = icv->bind_var;
  if (bind_var != omp_proc_bind_false && (flags & 7) != omp_proc_bind_false)
    bind_var = flags & 7;
  bind = bind_var;
  if (__builtin_expect (gomp_bind_var_list != NULL, 0)
      && thr->ts.level < gomp_bind_var_list_len)
    bind_var = gomp_bind_var_list[thr->ts.level];
  gomp_init_task (thr->task, task, icv);
  thr->task->taskgroup = taskgroup;
  team->implicit_task[0].icv.nthreads_var = nthreads_var;
  team->implicit_task[0].icv.bind_var = bind_var;

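  /* With only one thread requested, the master set up above already
     constitutes the whole team; there is nothing further to do.  */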
  if (nthreads == 1)
    return;

  i = 1;

  if (__builtin_expect (gomp_places_list != NULL, 0))
    {
      /* Depending on the chosen proc_bind model, set the subpartition
         for the master thread and initialize the helper variables
         P and optionally S, K and/or REST used by the later place
         computation for each additional thread.  */
      p = thr->place - 1;
      switch (bind)
        {
        case omp_proc_bind_true:
        case omp_proc_bind_close:
          if (nthreads > thr->ts.place_partition_len)
            {
              /* T > P.  S threads will be placed in each place,
                 and the final REST threads placed one by one
                 into the already occupied places.  */
              s = nthreads / thr->ts.place_partition_len;
              rest = nthreads % thr->ts.place_partition_len;
            }
          else
            s = 1;
          k = 1;
          break;
        case omp_proc_bind_master:
          /* Each thread will be bound to master's place.  */
          break;
        case omp_proc_bind_spread:
          if (nthreads <= thr->ts.place_partition_len)
            {
              /* T <= P.  Each subpartition will have in between s
                 and s+1 places (subpartitions starting at or
                 after rest will have s places, earlier s+1 places),
                 each thread will be bound to the first place in
                 its subpartition (except for the master thread
                 that can be bound to another place in its
                 subpartition).  */
              s = thr->ts.place_partition_len / nthreads;
              rest = thr->ts.place_partition_len % nthreads;
              rest = (s + 1) * rest + thr->ts.place_partition_off;
              if (p < rest)
                {
                  p -= (p - thr->ts.place_partition_off) % (s + 1);
                  thr->ts.place_partition_len = s + 1;
                }
              else
                {
                  p -= (p - rest) % s;
                  thr->ts.place_partition_len = s;
                }
              thr->ts.place_partition_off = p;
            }
          else
            {
              /* T > P.  Each subpartition will have just a single
                 place and we'll place between s and s+1
                 threads into each subpartition.  */
              s = nthreads / thr->ts.place_partition_len;
              rest = nthreads % thr->ts.place_partition_len;
              thr->ts.place_partition_off = p;
              thr->ts.place_partition_len = 1;
              k = 1;
            }
          break;
        }
    }
  else
    bind = omp_proc_bind_false;

  /* We only allow the reuse of idle threads for non-nested PARALLEL
     regions.  This appears to be implied by the semantics of
     threadprivate variables, but perhaps that's reading too much into
     things.  Certainly it does prevent any locking problems, since
     only the initial program thread will modify gomp_threads.  */
  if (!nested)
    {
      old_threads_used = pool->threads_used;

      if (nthreads <= old_threads_used)
        n = nthreads;
      else if (old_threads_used == 0)
        {
          n = 0;
          gomp_simple_barrier_init (&pool->threads_dock, nthreads);
        }
      else
        {
          n = old_threads_used;

          /* Increase the barrier threshold to make sure all new
             threads arrive before the team is released.  */
          gomp_simple_barrier_reinit (&pool->threads_dock, nthreads);
        }

      /* Not true yet, but soon will be.  We're going to release all
         threads from the dock, and those that aren't part of the
         team will exit.  */
      pool->threads_used = nthreads;

      /* If necessary, expand the size of the gomp_threads array.  It is
         expected that changes in the number of threads are rare, thus we
         make no effort to expand gomp_threads_size geometrically.  */
      if (nthreads >= pool->threads_size)
        {
          pool->threads_size = nthreads + 1;
          pool->threads
            = gomp_realloc (pool->threads,
                            pool->threads_size
                            * sizeof (struct gomp_thread *));
          /* Add current (master) thread to threads[].  */
          pool->threads[0] = thr;
        }

      /* Release existing idle threads.  */
      for (; i < n; ++i)
        {
          unsigned int place_partition_off = thr->ts.place_partition_off;
          unsigned int place_partition_len = thr->ts.place_partition_len;
          unsigned int place = 0;
          if (__builtin_expect (gomp_places_list != NULL, 0))
            {
              switch (bind)
                {
                case omp_proc_bind_true:
                case omp_proc_bind_close:
                  if (k == s)
                    {
                      ++p;
                      if (p == (team->prev_ts.place_partition_off
                                + team->prev_ts.place_partition_len))
                        p = team->prev_ts.place_partition_off;
                      k = 1;
                      if (i == nthreads - rest)
                        s = 1;
                    }
                  else
                    ++k;
                  break;
                case omp_proc_bind_master:
                  break;
                case omp_proc_bind_spread:
                  if (k == 0)
                    {
                      /* T <= P.  */
                      if (p < rest)
                        p += s + 1;
                      else
                        p += s;
                      if (p == (team->prev_ts.place_partition_off
                                + team->prev_ts.place_partition_len))
                        p = team->prev_ts.place_partition_off;
                      place_partition_off = p;
                      if (p < rest)
                        place_partition_len = s + 1;
                      else
                        place_partition_len = s;
                    }
                  else
                    {
                      /* T > P.  */
                      if (k == s)
                        {
                          ++p;
                          if (p == (team->prev_ts.place_partition_off
                                    + team->prev_ts.place_partition_len))
                            p = team->prev_ts.place_partition_off;
                          k = 1;
                          if (i == nthreads - rest)
                            s = 1;
                        }
                      else
                        ++k;
                      place_partition_off = p;
                      place_partition_len = 1;
                    }
                  break;
                }
              if (affinity_thr != NULL
                  || (bind != omp_proc_bind_true
                      && pool->threads[i]->place != p + 1)
                  || pool->threads[i]->place <= place_partition_off
                  || pool->threads[i]->place > (place_partition_off
                                                + place_partition_len))
                {
                  unsigned int l;
                  force_display = true;
                  if (affinity_thr == NULL)
                    {
                      unsigned int j;

                      if (team->prev_ts.place_partition_len > 64)
                        affinity_thr
                          = gomp_malloc (team->prev_ts.place_partition_len
                                         * sizeof (struct gomp_thread *));
                      else
                        affinity_thr
                          = gomp_alloca (team->prev_ts.place_partition_len
                                         * sizeof (struct gomp_thread *));
                      memset (affinity_thr, '\0',
                              team->prev_ts.place_partition_len
                              * sizeof (struct gomp_thread *));
                      for (j = i; j < old_threads_used; j++)
                        {
                          if (pool->threads[j]->place
                              > team->prev_ts.place_partition_off
                              && (pool->threads[j]->place
                                  <= (team->prev_ts.place_partition_off
                                      + team->prev_ts.place_partition_len)))
                            {
                              l = pool->threads[j]->place - 1
                                  - team->prev_ts.place_partition_off;
                              pool->threads[j]->data = affinity_thr[l];
                              affinity_thr[l] = pool->threads[j];
                            }
                          pool->threads[j] = NULL;
                        }
                      if (nthreads > old_threads_used)
                        memset (&pool->threads[old_threads_used],
                                '\0', ((nthreads - old_threads_used)
                                       * sizeof (struct gomp_thread *)));
                      n = nthreads;
                      affinity_count = old_threads_used - i;
                    }
                  if (affinity_count == 0)
                    break;
                  l = p;
                  if (affinity_thr[l - team->prev_ts.place_partition_off]
                      == NULL)
                    {
                      if (bind != omp_proc_bind_true)
                        continue;
                      for (l = place_partition_off;
                           l < place_partition_off + place_partition_len;
                           l++)
                        if (affinity_thr[l - team->prev_ts.place_partition_off]
                            != NULL)
                          break;
                      if (l == place_partition_off + place_partition_len)
                        continue;
                    }
                  nthr = affinity_thr[l - team->prev_ts.place_partition_off];
                  affinity_thr[l - team->prev_ts.place_partition_off]
                    = (struct gomp_thread *) nthr->data;
                  affinity_count--;
                  pool->threads[i] = nthr;
                }
              else
                nthr = pool->threads[i];
              place = p + 1;
            }
          else
            nthr = pool->threads[i];
          nthr->ts.team = team;
          nthr->ts.work_share = &team->work_shares[0];
          nthr->ts.last_work_share = NULL;
          nthr->ts.team_id = i;
          nthr->ts.level = team->prev_ts.level + 1;
          nthr->ts.active_level = thr->ts.active_level;
          nthr->ts.place_partition_off = place_partition_off;
          nthr->ts.place_partition_len = place_partition_len;
          nthr->ts.def_allocator = thr->ts.def_allocator;
#ifdef HAVE_SYNC_BUILTINS
          nthr->ts.single_count = 0;
#endif
          nthr->ts.static_trip = 0;
          nthr->task = &team->implicit_task[i];
          nthr->place = place;
          gomp_init_task (nthr->task, task, icv);
          team->implicit_task[i].icv.nthreads_var = nthreads_var;
          team->implicit_task[i].icv.bind_var = bind_var;
          nthr->task->taskgroup = taskgroup;
          nthr->fn = fn;
          nthr->data = data;
          team->ordered_release[i] = &nthr->release;
        }

      if (__builtin_expect (affinity_thr != NULL, 0))
        {
          /* If AFFINITY_THR is non-NULL just because we had to
             permute some threads in the pool, but we've managed
             to find exactly as many old threads as we'd find
             without affinity, we don't need to handle this
             specially anymore.  */
          if (nthreads <= old_threads_used
              ? (affinity_count == old_threads_used - nthreads)
              : (i == old_threads_used))
            {
              if (team->prev_ts.place_partition_len > 64)
                free (affinity_thr);
              affinity_thr = NULL;
              affinity_count = 0;
            }
          else
            {
              i = 1;
              /* We are going to compute the places/subpartitions
                 again from the beginning.  So, we need to reinitialize
                 the vars modified by the switch (bind) above inside
                 the loop, to the state they had after the initial
                 switch (bind).  */
              switch (bind)
                {
                case omp_proc_bind_true:
                case omp_proc_bind_close:
                  if (nthreads > thr->ts.place_partition_len)
                    /* T > P.  S has been changed, so needs
                       to be recomputed.  */
                    s = nthreads / thr->ts.place_partition_len;
                  k = 1;
                  p = thr->place - 1;
                  break;
                case omp_proc_bind_master:
                  /* No vars have been changed.  */
                  break;
                case omp_proc_bind_spread:
                  p = thr->ts.place_partition_off;
                  if (k != 0)
                    {
                      /* T > P.  */
                      s = nthreads / team->prev_ts.place_partition_len;
                      k = 1;
                    }
                  break;
                }

              /* Increase the barrier threshold to make sure all new
                 threads and all the threads we're going to let die
                 arrive before the team is released.  */
              if (affinity_count)
                gomp_simple_barrier_reinit (&pool->threads_dock,
                                            nthreads + affinity_count);
            }
        }

      if (i == nthreads)
        goto do_release;

    }

  if (__builtin_expect (nthreads + affinity_count > old_threads_used, 0))
    {
      long diff = (long) (nthreads + affinity_count) - (long) old_threads_used;

      if (old_threads_used == 0)
        --diff;

#ifdef HAVE_SYNC_BUILTINS
      __sync_fetch_and_add (&gomp_managed_threads, diff);
#else
      gomp_mutex_lock (&gomp_managed_threads_lock);
      gomp_managed_threads += diff;
      gomp_mutex_unlock (&gomp_managed_threads_lock);
#endif
    }

  attr = &gomp_thread_attr;
  if (__builtin_expect (gomp_places_list != NULL, 0))
    {
      size_t stacksize;
      pthread_attr_init (&thread_attr);
      if (! pthread_attr_getstacksize (&gomp_thread_attr, &stacksize))
        pthread_attr_setstacksize (&thread_attr, stacksize);
      attr = &thread_attr;
    }

  start_data = gomp_alloca (sizeof (struct gomp_thread_start_data)
                            * (nthreads - i));

  /* Launch new threads.  */
  for (; i < nthreads; ++i)
    {
      int err;

      start_data->ts.place_partition_off = thr->ts.place_partition_off;
      start_data->ts.place_partition_len = thr->ts.place_partition_len;
      start_data->place = 0;
      if (__builtin_expect (gomp_places_list != NULL, 0))
        {
          switch (bind)
            {
            case omp_proc_bind_true:
            case omp_proc_bind_close:
              if (k == s)
                {
                  ++p;
                  if (p == (team->prev_ts.place_partition_off
                            + team->prev_ts.place_partition_len))
                    p = team->prev_ts.place_partition_off;
                  k = 1;
                  if (i == nthreads - rest)
                    s = 1;
                }
              else
                ++k;
              break;
            case omp_proc_bind_master:
              break;
            case omp_proc_bind_spread:
              if (k == 0)
                {
                  /* T <= P.  */
                  if (p < rest)
                    p += s + 1;
                  else
                    p += s;
                  if (p == (team->prev_ts.place_partition_off
                            + team->prev_ts.place_partition_len))
                    p = team->prev_ts.place_partition_off;
                  start_data->ts.place_partition_off = p;
                  if (p < rest)
                    start_data->ts.place_partition_len = s + 1;
                  else
                    start_data->ts.place_partition_len = s;
                }
              else
                {
                  /* T > P.  */
                  if (k == s)
                    {
                      ++p;
                      if (p == (team->prev_ts.place_partition_off
                                + team->prev_ts.place_partition_len))
                        p = team->prev_ts.place_partition_off;
                      k = 1;
                      if (i == nthreads - rest)
                        s = 1;
                    }
                  else
                    ++k;
                  start_data->ts.place_partition_off = p;
                  start_data->ts.place_partition_len = 1;
                }
              break;
            }
          start_data->place = p + 1;
          if (affinity_thr != NULL && pool->threads[i] != NULL)
            continue;
          gomp_init_thread_affinity (attr, p);
        }

      start_data->fn = fn;
      start_data->fn_data = data;
      start_data->ts.team = team;
      start_data->ts.work_share = &team->work_shares[0];
      start_data->ts.last_work_share = NULL;
      start_data->ts.team_id = i;
      start_data->ts.level = team->prev_ts.level + 1;
      start_data->ts.active_level = thr->ts.active_level;
      start_data->ts.def_allocator = thr->ts.def_allocator;
#ifdef HAVE_SYNC_BUILTINS
      start_data->ts.single_count = 0;
#endif
      start_data->ts.static_trip = 0;
      start_data->task = &team->implicit_task[i];
      gomp_init_task (start_data->task, task, icv);
      team->implicit_task[i].icv.nthreads_var = nthreads_var;
      team->implicit_task[i].icv.bind_var = bind_var;
      start_data->task->taskgroup = taskgroup;
      start_data->thread_pool = pool;
      start_data->nested = nested;

      attr = gomp_adjust_thread_attr (attr, &thread_attr);
      err = pthread_create (&start_data->handle, attr, gomp_thread_start,
                            start_data);
      start_data++;
      if (err != 0)
        gomp_fatal ("Thread creation failed: %s", strerror (err));
    }

  if (__builtin_expect (attr == &thread_attr, 0))
    pthread_attr_destroy (&thread_attr);

 do_release:
  if (nested)
    gomp_barrier_wait (&team->barrier);
  else
    gomp_simple_barrier_wait (&pool->threads_dock);

  /* Decrease the barrier threshold to match the number of threads
     that should arrive back at the end of this team.  The extra
     threads should be exiting.  Note that we arrange for this test
     never to be true for nested teams.  If AFFINITY_COUNT is non-zero,
     the barrier as well as gomp_managed_threads was temporarily
     set to NTHREADS + AFFINITY_COUNT.  For NTHREADS < OLD_THREADS_USED,
     AFFINITY_COUNT, if non-zero, will always be at least
     OLD_THREADS_USED - NTHREADS.  */
  if (__builtin_expect (nthreads < old_threads_used, 0)
      || __builtin_expect (affinity_count, 0))
    {
      long diff = (long) nthreads - (long) old_threads_used;

      if (affinity_count)
        diff = -affinity_count;

      gomp_simple_barrier_reinit (&pool->threads_dock, nthreads);

#ifdef HAVE_SYNC_BUILTINS
      __sync_fetch_and_add (&gomp_managed_threads, diff);
#else
      gomp_mutex_lock (&gomp_managed_threads_lock);
      gomp_managed_threads += diff;
      gomp_mutex_unlock (&gomp_managed_threads_lock);
#endif
    }
  if (__builtin_expect (gomp_display_affinity_var, 0))
    {
      if (nested
          || nthreads != old_threads_used
          || force_display)
        {
          gomp_display_affinity_thread (gomp_thread_self (), &thr->ts,
                                        thr->place);
          if (nested)
            {
              start_data -= nthreads - 1;
              for (i = 1; i < nthreads; ++i)
                {
                  gomp_display_affinity_thread (
#ifdef LIBGOMP_USE_PTHREADS
                                                start_data->handle,
#else
                                                gomp_thread_self (),
#endif
                                                &start_data->ts,
                                                start_data->place);
                  start_data++;
                }
            }
          else
            {
              for (i = 1; i < nthreads; ++i)
                {
                  gomp_thread_handle handle
                    = gomp_thread_to_pthread_t (pool->threads[i]);
                  gomp_display_affinity_thread (handle, &pool->threads[i]->ts,
                                                pool->threads[i]->place);
                }
            }
        }
    }
  if (__builtin_expect (affinity_thr != NULL, 0)
      && team->prev_ts.place_partition_len > 64)
    free (affinity_thr);
}
#endif


/* Terminate the current team.  This is only to be called by the master
   thread.  We assume that we must wait for the other threads.  */

void
gomp_team_end (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;

  /* This barrier handles all pending explicit tasks.
     As #pragma omp cancel parallel might leave the awaited count in
     team->barrier in an inconsistent state, we need to use a different
     counter here.  */
  gomp_team_barrier_wait_final (&team->barrier);
  if (__builtin_expect (team->team_cancelled, 0))
    {
      struct gomp_work_share *ws = team->work_shares_to_free;
      do
        {
          struct gomp_work_share *next_ws = gomp_ptrlock_get (&ws->next_ws);
          if (next_ws == NULL)
            gomp_ptrlock_set (&ws->next_ws, ws);
          gomp_fini_work_share (ws);
          ws = next_ws;
        }
      while (ws != NULL);
    }
  else
    gomp_fini_work_share (thr->ts.work_share);

  gomp_end_task ();
  thr->ts = team->prev_ts;

  if (__builtin_expect (thr->ts.level != 0, 0))
    {
#ifdef HAVE_SYNC_BUILTINS
      __sync_fetch_and_add (&gomp_managed_threads, 1L - team->nthreads);
#else
      gomp_mutex_lock (&gomp_managed_threads_lock);
      gomp_managed_threads -= team->nthreads - 1L;
      gomp_mutex_unlock (&gomp_managed_threads_lock);
#endif
      /* This barrier has gomp_barrier_wait_last counterparts
         and ensures the team can be safely destroyed.  */
      gomp_barrier_wait (&team->barrier);
    }

  if (__builtin_expect (team->work_shares[0].next_alloc != NULL, 0))
    {
      struct gomp_work_share *ws = team->work_shares[0].next_alloc;
      do
        {
          struct gomp_work_share *next_ws = ws->next_alloc;
          free (ws);
          ws = next_ws;
        }
      while (ws != NULL);
    }
  gomp_sem_destroy (&team->master_release);

  if (__builtin_expect (thr->ts.team != NULL, 0)
      || __builtin_expect (team->nthreads == 1, 0))
    free_team (team);
  else
    {
      struct gomp_thread_pool *pool = thr->thread_pool;
      if (pool->last_team)
        free_team (pool->last_team);
      pool->last_team = team;
      gomp_release_thread_pool (pool);
    }
}

#ifdef LIBGOMP_USE_PTHREADS

/* Constructors for this file.  */

static void __attribute__((constructor))
initialize_team (void)
{
#if !defined HAVE_TLS && !defined USE_EMUTLS
  static struct gomp_thread initial_thread_tls_data;

  pthread_key_create (&gomp_tls_key, NULL);
  pthread_setspecific (gomp_tls_key, &initial_thread_tls_data);
#endif

  if (pthread_key_create (&gomp_thread_destructor, gomp_free_thread) != 0)
    gomp_fatal ("could not create thread pool destructor.");
}

static void __attribute__((destructor))
team_destructor (void)
{
  /* Without this, dlclose on libgomp could lead to subsequent
     crashes.  */
  pthread_key_delete (gomp_thread_destructor);
}

/* Similar to gomp_free_pool_helper, but doesn't detach itself;
   gomp_pause_host will pthread_join these threads.  */

static void
gomp_pause_pool_helper (void *thread_pool)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_thread_pool *pool
    = (struct gomp_thread_pool *) thread_pool;
  gomp_simple_barrier_wait_last (&pool->threads_dock);
  gomp_sem_destroy (&thr->release);
  thr->thread_pool = NULL;
  thr->task = NULL;
  pthread_exit (NULL);
}

/* Release the threads in the thread pool, joining them, and free the
   pool.  Return non-zero on failure (when called from within a parallel
   region).  */

int
gomp_pause_host (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_thread_pool *pool = thr->thread_pool;
  if (thr->ts.level)
    return -1;
  if (pool)
    {
      if (pool->threads_used > 0)
        {
          int i;
          pthread_t *thrs
            = gomp_alloca (sizeof (pthread_t) * pool->threads_used);
          for (i = 1; i < pool->threads_used; i++)
            {
              struct gomp_thread *nthr = pool->threads[i];
              nthr->fn = gomp_pause_pool_helper;
              nthr->data = pool;
              thrs[i] = gomp_thread_to_pthread_t (nthr);
            }
          /* This barrier undocks threads docked on pool->threads_dock.  */
          gomp_simple_barrier_wait (&pool->threads_dock);
          /* And this waits until all threads have called
             gomp_simple_barrier_wait_last in gomp_pause_pool_helper.  */
          gomp_simple_barrier_wait (&pool->threads_dock);
          /* Now it is safe to destroy the barrier and free the pool.  */
          gomp_simple_barrier_destroy (&pool->threads_dock);

#ifdef HAVE_SYNC_BUILTINS
          __sync_fetch_and_add (&gomp_managed_threads,
                                1L - pool->threads_used);
#else
          gomp_mutex_lock (&gomp_managed_threads_lock);
          gomp_managed_threads -= pool->threads_used - 1L;
          gomp_mutex_unlock (&gomp_managed_threads_lock);
#endif
          for (i = 1; i < pool->threads_used; i++)
            pthread_join (thrs[i], NULL);
        }
      if (pool->last_team)
        free_team (pool->last_team);
#ifndef __nvptx__
      team_free (pool->threads);
      team_free (pool);
#endif
      thr->thread_pool = NULL;
    }
  return 0;
}
#endif

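/* Set up an implicit task and initial ICVs for a thread libgomp has not
   seen before (for instance a foreign thread calling into the OpenMP
   runtime), and register it with the thread destructor key so that the
   state is released again at thread exit.  */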
struct gomp_task_icv *
gomp_new_icv (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_task *task = gomp_malloc (sizeof (struct gomp_task));
  gomp_init_task (task, NULL, &gomp_global_icv);
  thr->task = task;
#ifdef LIBGOMP_USE_PTHREADS
  pthread_setspecific (gomp_thread_destructor, thr);
#endif
  return &task->icv;
}