From 93fc47746815ea9dac413322fcade2931f757e7f Mon Sep 17 00:00:00 2001 From: Jonathan Wakely Date: Thu, 12 Nov 2020 21:25:14 +0000 Subject: [PATCH] libstdc++: Optimise std::future::wait_for and fix futex polling To poll a std::future to see if it's ready you have to call one of the timed waiting functions. The most obvious way is wait_for(0s) but this was previously very inefficient because it would turn the relative timeout to an absolute one by calling system_clock::now(). When the relative timeout is zero (or less) we're obviously going to get a time that has already passed, but the overhead of obtaining the current time can be dozens of microseconds. The alternative is to call wait_until with an absolute timeout that is in the past. If you know the clock's epoch is in the past you can use a default constructed time_point. Alternatively, using some_clock::time_point::min() gives the earliest time point supported by the clock, which should be safe to assume is in the past. However, using a futex wait with an absolute timeout before the UNIX epoch fails and sets errno=EINVAL. The new code using futex waits with absolute timeouts was not checking for this case, which could result in hangs (or killing the process if the library is built with assertions enabled). This patch checks for times before the epoch before attempting to wait on a futex with an absolute timeout, which fixes the hangs or crashes. It also makes it very fast to poll using an absolute timeout before the epoch (because we skip the futex syscall). It also makes future::wait_for avoid waiting at all when the relative timeout is zero or less, to avoid the unnecessary overhead of getting the current time. This makes polling with wait_for(0s) take only a few cycles instead of dozens of milliseconds. libstdc++-v3/ChangeLog: * include/std/future (future::wait_for): Do not wait for durations less than or equal to zero. 
* src/c++11/futex.cc (_M_futex_wait_until) (_M_futex_wait_until_steady): Do not wait for timeouts before the epoch. * testsuite/30_threads/future/members/poll.cc: New test. --- libstdc++-v3/include/std/future | 14 ++- libstdc++-v3/src/c++11/futex.cc | 9 ++ .../30_threads/future/members/poll.cc | 103 ++++++++++++++++++ 3 files changed, 123 insertions(+), 3 deletions(-) create mode 100644 libstdc++-v3/testsuite/30_threads/future/members/poll.cc diff --git a/libstdc++-v3/include/std/future b/libstdc++-v3/include/std/future index 5d948018c75..f7617cac8e9 100644 --- a/libstdc++-v3/include/std/future +++ b/libstdc++-v3/include/std/future @@ -345,10 +345,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // to synchronize with the thread that made it ready. if (_M_status._M_load(memory_order_acquire) == _Status::__ready) return future_status::ready; + if (_M_is_deferred_future()) return future_status::deferred; - if (_M_status._M_load_when_equal_for(_Status::__ready, - memory_order_acquire, __rel)) + + // Don't wait unless the relative time is greater than zero. + if (__rel > __rel.zero() + && _M_status._M_load_when_equal_for(_Status::__ready, + memory_order_acquire, + __rel)) { // _GLIBCXX_RESOLVE_LIB_DEFECTS // 2100. timed waiting functions must also join @@ -377,10 +382,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // to synchronize with the thread that made it ready. if (_M_status._M_load(memory_order_acquire) == _Status::__ready) return future_status::ready; + if (_M_is_deferred_future()) return future_status::deferred; + if (_M_status._M_load_when_equal_until(_Status::__ready, - memory_order_acquire, __abs)) + memory_order_acquire, + __abs)) { // _GLIBCXX_RESOLVE_LIB_DEFECTS // 2100. 
timed waiting functions must also join diff --git a/libstdc++-v3/src/c++11/futex.cc b/libstdc++-v3/src/c++11/futex.cc index 0331bd6df64..57f7dfe87e9 100644 --- a/libstdc++-v3/src/c++11/futex.cc +++ b/libstdc++-v3/src/c++11/futex.cc @@ -78,6 +78,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION struct timespec rt; rt.tv_sec = __s.count(); rt.tv_nsec = __ns.count(); + + // futex sets errno=EINVAL for absolute timeouts before the epoch. + if (__builtin_expect(rt.tv_sec < 0, false)) + return false; + if (syscall (SYS_futex, __addr, futex_wait_bitset_op | futex_clock_realtime_flag, __val, &rt, nullptr, futex_bitset_match_any) == -1) @@ -151,6 +156,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION rt.tv_sec = __s.count(); rt.tv_nsec = __ns.count(); + // futex sets errno=EINVAL for absolute timeouts before the epoch. + if (__builtin_expect(rt.tv_sec < 0, false)) + return false; + if (syscall (SYS_futex, __addr, futex_wait_bitset_op | futex_clock_monotonic_flag, __val, &rt, nullptr, futex_bitset_match_any) == -1) diff --git a/libstdc++-v3/testsuite/30_threads/future/members/poll.cc b/libstdc++-v3/testsuite/30_threads/future/members/poll.cc new file mode 100644 index 00000000000..54580579d3a --- /dev/null +++ b/libstdc++-v3/testsuite/30_threads/future/members/poll.cc @@ -0,0 +1,103 @@ +// Copyright (C) 2020 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. 
If not see +// <http://www.gnu.org/licenses/>. + +// { dg-options "-O3" } +// { dg-do run { target c++11 } } + +#include <future> +#include <chrono> +#include <iostream> +#include <testsuite_hooks.h> + +const int iterations = 200; // number of calls timed in each measurement loop + +using namespace std; + +template<typename Duration> // dur: total elapsed time for `iterations` calls +double +print(const char* desc, Duration dur) // prints total and average ns, returns the average ns per call +{ + auto ns = chrono::duration_cast<chrono::nanoseconds>(dur).count(); + double d = double(ns) / iterations; + cout << desc << ": " << ns << "ns for " << iterations + << " calls, avg " << d << "ns per call\n"; + return d; +} + +int main() +{ + promise<int> p; + future<int> f = p.get_future(); + + auto start = chrono::high_resolution_clock::now(); + for(int i = 0; i < iterations; i++) + f.wait_for(chrono::seconds(0)); + auto stop = chrono::high_resolution_clock::now(); + double wait_for_0 = print("wait_for(0s)", stop - start); + + start = chrono::high_resolution_clock::now(); + for(int i = 0; i < iterations; i++) + f.wait_until(chrono::system_clock::time_point()); + stop = chrono::high_resolution_clock::now(); + double wait_until_sys_epoch __attribute__((unused)) + = print("wait_until(system_clock epoch)", stop - start); + + start = chrono::high_resolution_clock::now(); + for(int i = 0; i < iterations; i++) + f.wait_until(chrono::steady_clock::time_point()); + stop = chrono::high_resolution_clock::now(); + double wait_until_steady_epoch __attribute__((unused)) + = print("wait_until(steady_clock epoch)", stop - start); + + start = chrono::high_resolution_clock::now(); + for(int i = 0; i < iterations; i++) + f.wait_until(chrono::system_clock::time_point::min()); + stop = chrono::high_resolution_clock::now(); + double wait_until_sys_min __attribute__((unused)) + = print("wait_until(system_clock minimum)", stop - start); + + start = chrono::high_resolution_clock::now(); + for(int i = 0; i < iterations; i++) + f.wait_until(chrono::steady_clock::time_point::min()); + stop = chrono::high_resolution_clock::now(); + double wait_until_steady_min __attribute__((unused)) + = print("wait_until(steady_clock minimum)", stop - start); + + p.set_value(1); // from here on the shared state is ready + + start = 
chrono::high_resolution_clock::now(); + for(int i = 0; i < iterations; i++) + f.wait_for(chrono::seconds(0)); + stop = chrono::high_resolution_clock::now(); + double ready = print("wait_for when ready", stop - start); + + // polling before ready with wait_for(0s) should be almost as fast as + // after the result is ready. + VERIFY( wait_for_0 < (ready * 10) ); + + // The following two tests fail with GCC 11, see + // https://gcc.gnu.org/pipermail/libstdc++/2020-November/051422.html +#if 0 + // polling before ready using wait_until(epoch) should not be terribly slow. + VERIFY( wait_until_sys_epoch < (ready * 100) ); + VERIFY( wait_until_steady_epoch < (ready * 100) ); +#endif + + // polling before ready using wait_until(min) should not be terribly slow. + VERIFY( wait_until_sys_min < (ready * 100) ); + VERIFY( wait_until_steady_min < (ready * 100) ); +} -- 2.30.2