From 238fc3441cecc5bdf982009c168d9f5b9085e8de Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Fri, 14 Oct 2016 13:36:35 +0000 Subject: [PATCH] runtime: copy cpuprof code from Go 1.7 runtime This replaces runtime/cpuprof.goc with go/runtime/cpuprof.go and adjusts the supporting code in runtime/proc.c. This adds another case where the compiler needs to avoid heap allocation in the runtime package: when evaluating a method expression into a closure. Implementing this required moving the relevant code from do_get_backend to do_flatten, so that I could easily add a temporary variable. Doing that let me get rid of Bound_method_expression::do_lower. Reviewed-on: https://go-review.googlesource.com/31050 From-SVN: r241163 --- gcc/go/gofrontend/MERGE | 2 +- gcc/go/gofrontend/expressions.cc | 113 ++++---- gcc/go/gofrontend/expressions.h | 5 +- libgo/Makefile.am | 1 - libgo/Makefile.in | 8 +- libgo/go/runtime/cpuprof.go | 453 +++++++++++++++++++++++++++++++ libgo/go/runtime/stubs.go | 13 + libgo/runtime/cpuprof.goc | 442 ------------------------------ libgo/runtime/go-signal.c | 2 + libgo/runtime/malloc.h | 3 +- libgo/runtime/proc.c | 58 ++-- libgo/runtime/runtime.h | 5 +- libgo/runtime/runtime1.goc | 4 - 13 files changed, 568 insertions(+), 541 deletions(-) create mode 100644 libgo/go/runtime/cpuprof.go delete mode 100644 libgo/runtime/cpuprof.goc diff --git a/gcc/go/gofrontend/MERGE b/gcc/go/gofrontend/MERGE index 6e689f464a9..769defcc221 100644 --- a/gcc/go/gofrontend/MERGE +++ b/gcc/go/gofrontend/MERGE @@ -1,4 +1,4 @@ -e3913d96fb024b916c87a4dc01f413523467ead9 +5f043fc2bf0f92a84a1f7da57acd79a61c9d2592 The first line of this file holds the git revision number of the last merge done from the gofrontend repository. diff --git a/gcc/go/gofrontend/expressions.cc b/gcc/go/gofrontend/expressions.cc index 40c8a4e2df7..daa1c92cce6 100644 --- a/gcc/go/gofrontend/expressions.cc +++ b/gcc/go/gofrontend/expressions.cc @@ -3623,6 +3623,8 @@ Unsafe_type_conversion_expression::do_get_backend(Translate_context* context) || et->map_type() != NULL || et->channel_type() != NULL || et->is_nil_type()); + else if (t->function_type() != NULL) + go_assert(et->points_to() != NULL); else go_unreachable(); @@ -6482,34 +6484,6 @@ Bound_method_expression::do_traverse(Traverse* traverse) return Expression::traverse(&this->expr_, traverse); } -// Lower the expression. If this is a method value rather than being -// called, and the method is accessed via a pointer, we may need to -// add nil checks. Introduce a temporary variable so that those nil -// checks do not cause multiple evaluation. - -Expression* -Bound_method_expression::do_lower(Gogo*, Named_object*, - Statement_inserter* inserter, int) -{ - // For simplicity we use a temporary for every call to an embedded - // method, even though some of them might be pure value methods and - // not require a temporary. - if (this->expr_->var_expression() == NULL - && this->expr_->temporary_reference_expression() == NULL - && this->expr_->set_and_use_temporary_expression() == NULL - && (this->method_->field_indexes() != NULL - || (this->method_->is_value_method() - && this->expr_->type()->points_to() != NULL))) - { - Temporary_statement* temp = - Statement::make_temporary(this->expr_->type(), NULL, this->location()); - inserter->insert(temp); - this->expr_ = Expression::make_set_and_use_temporary(temp, this->expr_, - this->location()); - } - return this; -} - // Return the type of a bound method expression. 
The type of this // object is simply the type of the method with no receiver. @@ -6724,32 +6698,43 @@ bme_check_nil(const Method::Field_indexes* field_indexes, Location loc, return cond; } -// Get the backend representation for a method value. +// Flatten a method value into a struct with nil checks. We can't do +// this in the lowering phase, because if the method value is called +// directly we don't need a thunk. That case will have been handled +// by Call_expression::do_lower, so if we get here then we do need a +// thunk. -Bexpression* -Bound_method_expression::do_get_backend(Translate_context* context) +Expression* +Bound_method_expression::do_flatten(Gogo* gogo, Named_object*, + Statement_inserter* inserter) { - Named_object* thunk = Bound_method_expression::create_thunk(context->gogo(), + Location loc = this->location(); + + Named_object* thunk = Bound_method_expression::create_thunk(gogo, this->method_, this->function_); if (thunk->is_erroneous()) { go_assert(saw_errors()); - return context->backend()->error_expression(); + return Expression::make_error(loc); } - // FIXME: We should lower this earlier, but we can't lower it in the - // lowering pass because at that point we don't know whether we need - // to create the thunk or not. If the expression is called, we - // don't need the thunk. - - Location loc = this->location(); + // Force the expression into a variable. This is only necessary if + // we are going to do nil checks below, but it's easy enough to + // always do it. + Expression* expr = this->expr_; + if (!expr->is_variable()) + { + Temporary_statement* etemp = Statement::make_temporary(NULL, expr, loc); + inserter->insert(etemp); + expr = Expression::make_temporary_reference(etemp, loc); + } // If the method expects a value, and we have a pointer, we need to // dereference the pointer. Named_object* fn = this->method_->named_object(); - Function_type* fntype; + Function_type *fntype; if (fn->is_function()) fntype = fn->func_value()->type(); else if (fn->is_function_declaration()) @@ -6757,7 +6742,7 @@ Bound_method_expression::do_get_backend(Translate_context* context) else go_unreachable(); - Expression* val = this->expr_; + Expression* val = expr; if (fntype->receiver()->type()->points_to() == NULL && val->type()->points_to() != NULL) val = Expression::make_unary(OPERATOR_MULT, val, loc); @@ -6781,17 +6766,28 @@ Bound_method_expression::do_get_backend(Translate_context* context) vals->push_back(val); Expression* ret = Expression::make_struct_composite_literal(st, vals, loc); - ret = Expression::make_heap_expression(ret, loc); - // See whether the expression or any embedded pointers are nil. + if (!gogo->compiling_runtime() || gogo->package_name() != "runtime") + ret = Expression::make_heap_expression(ret, loc); + else + { + // When compiling the runtime, method closures do not escape. + // When escape analysis becomes the default, and applies to + // method closures, this should be changed to make it an error + // if a method closure escapes. + Temporary_statement* ctemp = Statement::make_temporary(st, ret, loc); + inserter->insert(ctemp); + ret = Expression::make_temporary_reference(ctemp, loc); + ret = Expression::make_unary(OPERATOR_AND, ret, loc); + ret->unary_expression()->set_does_not_escape(); + } + + // If necessary, check whether the expression or any embedded + // pointers are nil. 
Expression* nil_check = NULL; - Expression* expr = this->expr_; if (this->method_->field_indexes() != NULL) { - // Note that we are evaluating this->expr_ twice, but that is OK - // because in the lowering pass we forced it into a temporary - // variable. Expression* ref = expr; nil_check = bme_check_nil(this->method_->field_indexes(), loc, &ref); expr = ref; @@ -6808,19 +6804,20 @@ Bound_method_expression::do_get_backend(Translate_context* context) nil_check = Expression::make_binary(OPERATOR_OROR, nil_check, n, loc); } - Bexpression* bme = ret->get_backend(context); if (nil_check != NULL) { - Gogo* gogo = context->gogo(); - Bexpression* crash = - gogo->runtime_error(RUNTIME_ERROR_NIL_DEREFERENCE, - loc)->get_backend(context); - Btype* btype = ret->type()->get_backend(gogo); - Bexpression* bcheck = nil_check->get_backend(context); - bme = gogo->backend()->conditional_expression(btype, bcheck, crash, - bme, loc); - } - return bme; + Expression* crash = gogo->runtime_error(RUNTIME_ERROR_NIL_DEREFERENCE, + loc); + // Fix the type of the conditional expression by pretending to + // evaluate to RET either way through the conditional. + crash = Expression::make_compound(crash, ret, loc); + ret = Expression::make_conditional(nil_check, crash, ret, loc); + } + + // RET is a pointer to a struct, but we want a function type. + ret = Expression::make_unsafe_cast(this->type(), ret, loc); + + return ret; } // Dump ast representation of a bound method expression. diff --git a/gcc/go/gofrontend/expressions.h b/gcc/go/gofrontend/expressions.h index 11614c3c3e6..0d00f458c38 100644 --- a/gcc/go/gofrontend/expressions.h +++ b/gcc/go/gofrontend/expressions.h @@ -2888,7 +2888,7 @@ class Bound_method_expression : public Expression do_traverse(Traverse*); Expression* - do_lower(Gogo*, Named_object*, Statement_inserter*, int); + do_flatten(Gogo*, Named_object*, Statement_inserter*); Type* do_type(); @@ -2907,7 +2907,8 @@ class Bound_method_expression : public Expression } Bexpression* - do_get_backend(Translate_context*); + do_get_backend(Translate_context*) + { go_unreachable(); } void do_dump_expression(Ast_dump_context*) const; diff --git a/libgo/Makefile.am b/libgo/Makefile.am index be5b0cac19c..dee6fbcc25c 100644 --- a/libgo/Makefile.am +++ b/libgo/Makefile.am @@ -512,7 +512,6 @@ runtime_files = \ $(runtime_thread_files) \ runtime/yield.c \ $(rtems_task_variable_add_file) \ - cpuprof.c \ go-iface.c \ lfstack.c \ malloc.c \ diff --git a/libgo/Makefile.in b/libgo/Makefile.in index 6797a349bad..c811312f21d 100644 --- a/libgo/Makefile.in +++ b/libgo/Makefile.in @@ -261,9 +261,9 @@ am__objects_6 = go-append.lo go-assert.lo go-assert-interface.lo \ mcentral.lo $(am__objects_1) mfixalloc.lo mgc0.lo mheap.lo \ msize.lo $(am__objects_2) panic.lo parfor.lo print.lo proc.lo \ runtime.lo signal_unix.lo thread.lo $(am__objects_3) yield.lo \ - $(am__objects_4) cpuprof.lo go-iface.lo lfstack.lo malloc.lo \ - mprof.lo netpoll.lo rdebug.lo reflect.lo runtime1.lo \ - sigqueue.lo time.lo $(am__objects_5) + $(am__objects_4) go-iface.lo lfstack.lo malloc.lo mprof.lo \ + netpoll.lo rdebug.lo reflect.lo runtime1.lo sigqueue.lo \ + time.lo $(am__objects_5) am_libgo_llgo_la_OBJECTS = $(am__objects_6) libgo_llgo_la_OBJECTS = $(am_libgo_llgo_la_OBJECTS) libgo_llgo_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \ @@ -911,7 +911,6 @@ runtime_files = \ $(runtime_thread_files) \ runtime/yield.c \ $(rtems_task_variable_add_file) \ - cpuprof.c \ go-iface.c \ lfstack.c \ malloc.c \ @@ -1547,7 +1546,6 @@ mostlyclean-compile: 
distclean-compile: -rm -f *.tab.c -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cpuprof.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/env_posix.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/getncpu-bsd.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/getncpu-irix.Plo@am__quote@ diff --git a/libgo/go/runtime/cpuprof.go b/libgo/go/runtime/cpuprof.go new file mode 100644 index 00000000000..873276f3639 --- /dev/null +++ b/libgo/go/runtime/cpuprof.go @@ -0,0 +1,453 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// CPU profiling. +// Based on algorithms and data structures used in +// http://code.google.com/p/google-perftools/. +// +// The main difference between this code and the google-perftools +// code is that this code is written to allow copying the profile data +// to an arbitrary io.Writer, while the google-perftools code always +// writes to an operating system file. +// +// The signal handler for the profiling clock tick adds a new stack trace +// to a hash table tracking counts for recent traces. Most clock ticks +// hit in the cache. In the event of a cache miss, an entry must be +// evicted from the hash table, copied to a log that will eventually be +// written as profile data. The google-perftools code flushed the +// log itself during the signal handler. This code cannot do that, because +// the io.Writer might block or need system calls or locks that are not +// safe to use from within the signal handler. Instead, we split the log +// into two halves and let the signal handler fill one half while a goroutine +// is writing out the other half. When the signal handler fills its half, it +// offers to swap with the goroutine. If the writer is not done with its half, +// we lose the stack trace for this clock tick (and record that loss). +// The goroutine interacts with the signal handler by calling getprofile() to +// get the next log piece to write, implicitly handing back the last log +// piece it obtained. +// +// The state of this dance between the signal handler and the goroutine +// is encoded in the Profile.handoff field. If handoff == 0, then the goroutine +// is not using either log half and is waiting (or will soon be waiting) for +// a new piece by calling notesleep(&p.wait). If the signal handler +// changes handoff from 0 to non-zero, it must call notewakeup(&p.wait) +// to wake the goroutine. The value indicates the number of entries in the +// log half being handed off. The goroutine leaves the non-zero value in +// place until it has finished processing the log half and then flips the number +// back to zero. Setting the high bit in handoff means that the profiling is over, +// and the goroutine is now in charge of flushing the data left in the hash table +// to the log and returning that data. +// +// The handoff field is manipulated using atomic operations. +// For the most part, the manipulation of handoff is orderly: if handoff == 0 +// then the signal handler owns it and can change it to non-zero. +// If handoff != 0 then the goroutine owns it and can change it to zero. +// If that were the end of the story then we would not need to manipulate +// handoff using atomic operations. The operations are needed, however, +// in order to let the log closer set the high bit to indicate "EOF" safely +// in the situation when normally the goroutine "owns" handoff. 
+ +package runtime + +import ( + "runtime/internal/atomic" + "unsafe" +) + +const ( + numBuckets = 1 << 10 + logSize = 1 << 17 + assoc = 4 + maxCPUProfStack = 64 +) + +type cpuprofEntry struct { + count uintptr + depth int + stack [maxCPUProfStack]uintptr +} + +type cpuProfile struct { + on bool // profiling is on + wait note // goroutine waits here + count uintptr // tick count + evicts uintptr // eviction count + lost uintptr // lost ticks that need to be logged + + // Active recent stack traces. + hash [numBuckets]struct { + entry [assoc]cpuprofEntry + } + + // Log of traces evicted from hash. + // Signal handler has filled log[toggle][:nlog]. + // Goroutine is writing log[1-toggle][:handoff]. + log [2][logSize / 2]uintptr + nlog int + toggle int32 + handoff uint32 + + // Writer state. + // Writer maintains its own toggle to avoid races + // looking at signal handler's toggle. + wtoggle uint32 + wholding bool // holding & need to release a log half + flushing bool // flushing hash table - profile is over + eodSent bool // special end-of-data record sent; => flushing +} + +var ( + cpuprofLock mutex + cpuprof *cpuProfile + + eod = [3]uintptr{0, 1, 0} +) + +func setcpuprofilerate(hz int32) { + systemstack(func() { + setcpuprofilerate_m(hz) + }) +} + +// lostProfileData is a no-op function used in profiles +// to mark the number of profiling stack traces that were +// discarded due to slow data writers. +func lostProfileData() {} + +// SetCPUProfileRate sets the CPU profiling rate to hz samples per second. +// If hz <= 0, SetCPUProfileRate turns off profiling. +// If the profiler is on, the rate cannot be changed without first turning it off. +// +// Most clients should use the runtime/pprof package or +// the testing package's -test.cpuprofile flag instead of calling +// SetCPUProfileRate directly. +func SetCPUProfileRate(hz int) { + // Clamp hz to something reasonable. + if hz < 0 { + hz = 0 + } + if hz > 1000000 { + hz = 1000000 + } + + lock(&cpuprofLock) + if hz > 0 { + if cpuprof == nil { + cpuprof = (*cpuProfile)(sysAlloc(unsafe.Sizeof(cpuProfile{}), &memstats.other_sys)) + if cpuprof == nil { + print("runtime: cpu profiling cannot allocate memory\n") + unlock(&cpuprofLock) + return + } + } + if cpuprof.on || cpuprof.handoff != 0 { + print("runtime: cannot set cpu profile rate until previous profile has finished.\n") + unlock(&cpuprofLock) + return + } + + cpuprof.on = true + // pprof binary header format. + // https://github.com/gperftools/gperftools/blob/master/src/profiledata.cc#L119 + p := &cpuprof.log[0] + p[0] = 0 // count for header + p[1] = 3 // depth for header + p[2] = 0 // version number + p[3] = uintptr(1e6 / hz) // period (microseconds) + p[4] = 0 + cpuprof.nlog = 5 + cpuprof.toggle = 0 + cpuprof.wholding = false + cpuprof.wtoggle = 0 + cpuprof.flushing = false + cpuprof.eodSent = false + noteclear(&cpuprof.wait) + + setcpuprofilerate(int32(hz)) + } else if cpuprof != nil && cpuprof.on { + setcpuprofilerate(0) + cpuprof.on = false + + // Now add is not running anymore, and getprofile owns the entire log. + // Set the high bit in cpuprof.handoff to tell getprofile. + for { + n := cpuprof.handoff + if n&0x80000000 != 0 { + print("runtime: setcpuprofile(off) twice\n") + } + if atomic.Cas(&cpuprof.handoff, n, n|0x80000000) { + if n == 0 { + // we did the transition from 0 -> nonzero so we wake getprofile + notewakeup(&cpuprof.wait) + } + break + } + } + } + unlock(&cpuprofLock) +} + +// add adds the stack trace to the profile. 
+// It is called from signal handlers and other limited environments +// and cannot allocate memory or acquire locks that might be +// held at the time of the signal, nor can it use substantial amounts +// of stack. It is allowed to call evict. +//go:nowritebarrierrec +func (p *cpuProfile) add(pc []uintptr) { + p.addWithFlushlog(pc, p.flushlog) +} + +// addWithFlushlog implements add and addNonGo. +// It is called from signal handlers and other limited environments +// and cannot allocate memory or acquire locks that might be +// held at the time of the signal, nor can it use substantial amounts +// of stack. It may be called by a signal handler with no g or m. +// It is allowed to call evict, passing the flushlog parameter. +//go:nosplit +//go:nowritebarrierrec +func (p *cpuProfile) addWithFlushlog(pc []uintptr, flushlog func() bool) { + if len(pc) > maxCPUProfStack { + pc = pc[:maxCPUProfStack] + } + + // Compute hash. + h := uintptr(0) + for _, x := range pc { + h = h<<8 | (h >> (8 * (unsafe.Sizeof(h) - 1))) + h += x * 41 + } + p.count++ + + // Add to entry count if already present in table. + b := &p.hash[h%numBuckets] +Assoc: + for i := range b.entry { + e := &b.entry[i] + if e.depth != len(pc) { + continue + } + for j := range pc { + if e.stack[j] != pc[j] { + continue Assoc + } + } + e.count++ + return + } + + // Evict entry with smallest count. + var e *cpuprofEntry + for i := range b.entry { + if e == nil || b.entry[i].count < e.count { + e = &b.entry[i] + } + } + if e.count > 0 { + if !p.evict(e, flushlog) { + // Could not evict entry. Record lost stack. + p.lost++ + return + } + p.evicts++ + } + + // Reuse the newly evicted entry. + e.depth = len(pc) + e.count = 1 + copy(e.stack[:], pc) +} + +// evict copies the given entry's data into the log, so that +// the entry can be reused. evict is called from add, which +// is called from the profiling signal handler, so it must not +// allocate memory or block, and it may be called with no g or m. +// It is safe to call flushlog. evict returns true if the entry was +// copied to the log, false if there was no room available. +//go:nosplit +//go:nowritebarrierrec +func (p *cpuProfile) evict(e *cpuprofEntry, flushlog func() bool) bool { + d := e.depth + nslot := d + 2 + log := &p.log[p.toggle] + if p.nlog+nslot > len(log) { + if !flushlog() { + return false + } + log = &p.log[p.toggle] + } + + q := p.nlog + log[q] = e.count + q++ + log[q] = uintptr(d) + q++ + copy(log[q:], e.stack[:d]) + q += d + p.nlog = q + e.count = 0 + return true +} + +// flushlog tries to flush the current log and switch to the other one. +// flushlog is called from evict, called from add, called from the signal handler, +// so it cannot allocate memory or block. It can try to swap logs with +// the writing goroutine, as explained in the comment at the top of this file. +//go:nowritebarrierrec +func (p *cpuProfile) flushlog() bool { + if !atomic.Cas(&p.handoff, 0, uint32(p.nlog)) { + return false + } + notewakeup(&p.wait) + + p.toggle = 1 - p.toggle + log := &p.log[p.toggle] + q := 0 + if p.lost > 0 { + lostPC := funcPC(lostProfileData) + log[0] = p.lost + log[1] = 1 + log[2] = lostPC + q = 3 + p.lost = 0 + } + p.nlog = q + return true +} + +// addNonGo is like add, but runs on a non-Go thread. +// It can't do anything that might need a g or an m. +// With this entry point, we don't try to flush the log when evicting an +// old entry. Instead, we just drop the stack trace if we're out of space. 
+//go:nosplit +//go:nowritebarrierrec +func (p *cpuProfile) addNonGo(pc []uintptr) { + p.addWithFlushlog(pc, func() bool { return false }) +} + +// getprofile blocks until the next block of profiling data is available +// and returns it as a []byte. It is called from the writing goroutine. +func (p *cpuProfile) getprofile() []byte { + if p == nil { + return nil + } + + if p.wholding { + // Release previous log to signal handling side. + // Loop because we are racing against SetCPUProfileRate(0). + for { + n := p.handoff + if n == 0 { + print("runtime: phase error during cpu profile handoff\n") + return nil + } + if n&0x80000000 != 0 { + p.wtoggle = 1 - p.wtoggle + p.wholding = false + p.flushing = true + goto Flush + } + if atomic.Cas(&p.handoff, n, 0) { + break + } + } + p.wtoggle = 1 - p.wtoggle + p.wholding = false + } + + if p.flushing { + goto Flush + } + + if !p.on && p.handoff == 0 { + return nil + } + + // Wait for new log. + notetsleepg(&p.wait, -1) + noteclear(&p.wait) + + switch n := p.handoff; { + case n == 0: + print("runtime: phase error during cpu profile wait\n") + return nil + case n == 0x80000000: + p.flushing = true + goto Flush + default: + n &^= 0x80000000 + + // Return new log to caller. + p.wholding = true + + return uintptrBytes(p.log[p.wtoggle][:n]) + } + + // In flush mode. + // Add is no longer being called. We own the log. + // Also, p.handoff is non-zero, so flushlog will return false. + // Evict the hash table into the log and return it. +Flush: + for i := range p.hash { + b := &p.hash[i] + for j := range b.entry { + e := &b.entry[j] + if e.count > 0 && !p.evict(e, p.flushlog) { + // Filled the log. Stop the loop and return what we've got. + break Flush + } + } + } + + // Return pending log data. + if p.nlog > 0 { + // Note that we're using toggle now, not wtoggle, + // because we're working on the log directly. + n := p.nlog + p.nlog = 0 + return uintptrBytes(p.log[p.toggle][:n]) + } + + // Made it through the table without finding anything to log. + if !p.eodSent { + // We may not have space to append this to the partial log buf, + // so we always return a new slice for the end-of-data marker. + p.eodSent = true + return uintptrBytes(eod[:]) + } + + // Finally done. Clean up and return nil. + p.flushing = false + if !atomic.Cas(&p.handoff, p.handoff, 0) { + print("runtime: profile flush racing with something\n") + } + return nil +} + +func uintptrBytes(p []uintptr) (ret []byte) { + pp := (*slice)(unsafe.Pointer(&p)) + rp := (*slice)(unsafe.Pointer(&ret)) + + rp.array = pp.array + rp.len = pp.len * int(unsafe.Sizeof(p[0])) + rp.cap = rp.len + + return +} + +// CPUProfile returns the next chunk of binary CPU profiling stack trace data, +// blocking until data is available. If profiling is turned off and all the profile +// data accumulated while it was on has been returned, CPUProfile returns nil. +// The caller must save the returned data before calling CPUProfile again. +// +// Most clients should use the runtime/pprof package or +// the testing package's -test.cpuprofile flag instead of calling +// CPUProfile directly. 
+func CPUProfile() []byte { + return cpuprof.getprofile() +} + +//go:linkname runtime_pprof_runtime_cyclesPerSecond runtime_pprof.runtime_cyclesPerSecond +func runtime_pprof_runtime_cyclesPerSecond() int64 { + return tickspersecond() +} diff --git a/libgo/go/runtime/stubs.go b/libgo/go/runtime/stubs.go index 13fc5e5cb0f..30a0f559a92 100644 --- a/libgo/go/runtime/stubs.go +++ b/libgo/go/runtime/stubs.go @@ -415,3 +415,16 @@ func startTheWorld() { func getMstats() *mstats { return &memstats } + +// Temporary for gccgo until we port proc.go. +func setcpuprofilerate_m(hz int32) + +// Temporary for gccgo until we port mem_GOOS.go. +func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer + +// Temporary for gccgo until we port proc.go, so that the C signal +// handler can call into cpuprof. +//go:linkname cpuprofAdd runtime.cpuprofAdd +func cpuprofAdd(stk []uintptr) { + cpuprof.add(stk) +} diff --git a/libgo/runtime/cpuprof.goc b/libgo/runtime/cpuprof.goc deleted file mode 100644 index 123e074666d..00000000000 --- a/libgo/runtime/cpuprof.goc +++ /dev/null @@ -1,442 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// CPU profiling. -// Based on algorithms and data structures used in -// http://code.google.com/p/google-perftools/. -// -// The main difference between this code and the google-perftools -// code is that this code is written to allow copying the profile data -// to an arbitrary io.Writer, while the google-perftools code always -// writes to an operating system file. -// -// The signal handler for the profiling clock tick adds a new stack trace -// to a hash table tracking counts for recent traces. Most clock ticks -// hit in the cache. In the event of a cache miss, an entry must be -// evicted from the hash table, copied to a log that will eventually be -// written as profile data. The google-perftools code flushed the -// log itself during the signal handler. This code cannot do that, because -// the io.Writer might block or need system calls or locks that are not -// safe to use from within the signal handler. Instead, we split the log -// into two halves and let the signal handler fill one half while a goroutine -// is writing out the other half. When the signal handler fills its half, it -// offers to swap with the goroutine. If the writer is not done with its half, -// we lose the stack trace for this clock tick (and record that loss). -// The goroutine interacts with the signal handler by calling getprofile() to -// get the next log piece to write, implicitly handing back the last log -// piece it obtained. -// -// The state of this dance between the signal handler and the goroutine -// is encoded in the Profile.handoff field. If handoff == 0, then the goroutine -// is not using either log half and is waiting (or will soon be waiting) for -// a new piece by calling notesleep(&p->wait). If the signal handler -// changes handoff from 0 to non-zero, it must call notewakeup(&p->wait) -// to wake the goroutine. The value indicates the number of entries in the -// log half being handed off. The goroutine leaves the non-zero value in -// place until it has finished processing the log half and then flips the number -// back to zero. Setting the high bit in handoff means that the profiling is over, -// and the goroutine is now in charge of flushing the data left in the hash table -// to the log and returning that data. 
-// -// The handoff field is manipulated using atomic operations. -// For the most part, the manipulation of handoff is orderly: if handoff == 0 -// then the signal handler owns it and can change it to non-zero. -// If handoff != 0 then the goroutine owns it and can change it to zero. -// If that were the end of the story then we would not need to manipulate -// handoff using atomic operations. The operations are needed, however, -// in order to let the log closer set the high bit to indicate "EOF" safely -// in the situation when normally the goroutine "owns" handoff. - -package runtime -#include "runtime.h" -#include "arch.h" -#include "malloc.h" - -#include "array.h" -typedef struct __go_open_array Slice; -#define array __values -#define len __count -#define cap __capacity - -enum -{ - HashSize = 1<<10, - LogSize = 1<<17, - Assoc = 4, - MaxStack = 64, -}; - -typedef struct Profile Profile; -typedef struct Bucket Bucket; -typedef struct Entry Entry; - -struct Entry { - uintptr count; - uintptr depth; - uintptr stack[MaxStack]; -}; - -struct Bucket { - Entry entry[Assoc]; -}; - -struct Profile { - bool on; // profiling is on - Note wait; // goroutine waits here - uintptr count; // tick count - uintptr evicts; // eviction count - uintptr lost; // lost ticks that need to be logged - - // Active recent stack traces. - Bucket hash[HashSize]; - - // Log of traces evicted from hash. - // Signal handler has filled log[toggle][:nlog]. - // Goroutine is writing log[1-toggle][:handoff]. - uintptr log[2][LogSize/2]; - uintptr nlog; - int32 toggle; - uint32 handoff; - - // Writer state. - // Writer maintains its own toggle to avoid races - // looking at signal handler's toggle. - uint32 wtoggle; - bool wholding; // holding & need to release a log half - bool flushing; // flushing hash table - profile is over - bool eod_sent; // special end-of-data record sent; => flushing -}; - -static Lock lk; -static Profile *prof; - -static void tick(uintptr*, int32); -static void add(Profile*, uintptr*, int32); -static bool evict(Profile*, Entry*); -static bool flushlog(Profile*); - -static uintptr eod[3] = {0, 1, 0}; - -// LostProfileData is a no-op function used in profiles -// to mark the number of profiling stack traces that were -// discarded due to slow data writers. -static void -LostProfileData(void) -{ -} - -extern void runtime_SetCPUProfileRate(intgo) - __asm__ (GOSYM_PREFIX "runtime.SetCPUProfileRate"); - -// SetCPUProfileRate sets the CPU profiling rate. -// The user documentation is in debug.go. -void -runtime_SetCPUProfileRate(intgo hz) -{ - uintptr *p; - uintptr n; - - // Clamp hz to something reasonable. - if(hz < 0) - hz = 0; - if(hz > 1000000) - hz = 1000000; - - runtime_lock(&lk); - if(hz > 0) { - if(prof == nil) { - prof = runtime_SysAlloc(sizeof *prof, &mstats()->other_sys); - if(prof == nil) { - runtime_printf("runtime: cpu profiling cannot allocate memory\n"); - runtime_unlock(&lk); - return; - } - } - if(prof->on || prof->handoff != 0) { - runtime_printf("runtime: cannot set cpu profile rate until previous profile has finished.\n"); - runtime_unlock(&lk); - return; - } - - prof->on = true; - p = prof->log[0]; - // pprof binary header format. 
-		// http://code.google.com/p/google-perftools/source/browse/trunk/src/profiledata.cc#117
-		*p++ = 0; // count for header
-		*p++ = 3; // depth for header
-		*p++ = 0; // version number
-		*p++ = 1000000 / hz; // period (microseconds)
-		*p++ = 0;
-		prof->nlog = p - prof->log[0];
-		prof->toggle = 0;
-		prof->wholding = false;
-		prof->wtoggle = 0;
-		prof->flushing = false;
-		prof->eod_sent = false;
-		runtime_noteclear(&prof->wait);
-
-		runtime_setcpuprofilerate(tick, hz);
-	} else if(prof != nil && prof->on) {
-		runtime_setcpuprofilerate(nil, 0);
-		prof->on = false;
-
-		// Now add is not running anymore, and getprofile owns the entire log.
-		// Set the high bit in prof->handoff to tell getprofile.
-		for(;;) {
-			n = prof->handoff;
-			if(n&0x80000000)
-				runtime_printf("runtime: setcpuprofile(off) twice");
-			if(runtime_cas(&prof->handoff, n, n|0x80000000))
-				break;
-		}
-		if(n == 0) {
-			// we did the transition from 0 -> nonzero so we wake getprofile
-			runtime_notewakeup(&prof->wait);
-		}
-	}
-	runtime_unlock(&lk);
-}
-
-static void
-tick(uintptr *pc, int32 n)
-{
-	add(prof, pc, n);
-}
-
-// add adds the stack trace to the profile.
-// It is called from signal handlers and other limited environments
-// and cannot allocate memory or acquire locks that might be
-// held at the time of the signal, nor can it use substantial amounts
-// of stack. It is allowed to call evict.
-static void
-add(Profile *p, uintptr *pc, int32 n)
-{
-	int32 i, j;
-	uintptr h, x;
-	Bucket *b;
-	Entry *e;
-
-	if(n > MaxStack)
-		n = MaxStack;
-
-	// Compute hash.
-	h = 0;
-	for(i=0; i<n; i++) {
-		h = (h<<8) | (h>>(8*(sizeof(h)-1)));
-		x = pc[i];
-		h += x*31 + x*7 + x*3;
-	}
-	p->count++;
-
-	// Add to entry count if already present in table.
-	b = &p->hash[h%HashSize];
-	for(i=0; i<Assoc; i++) {
-		e = &b->entry[i];
-		if(e->depth != (uintptr)n)
-			continue;
-		for(j=0; j<n; j++)
-			if(e->stack[j] != pc[j])
-				goto ContinueAssoc;
-		e->count++;
-		return;
-	ContinueAssoc:;
-	}
-
-	// Evict entry with smallest count.
-	e = &b->entry[0];
-	for(i=1; i<Assoc; i++)
-		if(b->entry[i].count < e->count)
-			e = &b->entry[i];
-	if(e->count > 0) {
-		if(!evict(p, e)) {
-			// Could not evict entry. Record lost stack.
-			p->lost++;
-			return;
-		}
-		p->evicts++;
-	}
-
-	// Reuse the newly evicted entry.
-	e->depth = n;
-	e->count = 1;
-	for(i=0; i<n; i++)
-		e->stack[i] = pc[i];
-}
-
-// evict copies the given entry's data into the log, so that
-// the entry can be reused. evict is called from add, which
-// is called from the profiling signal handler, so it must not
-// allocate memory or block. It is safe to call flushLog.
-// evict returns true if the entry was copied to the log,
-// false if there was no room available.
-static bool
-evict(Profile *p, Entry *e)
-{
-	int32 i, d, nslot;
-	uintptr *log, *q;
-
-	d = e->depth;
-	nslot = d+2;
-	log = p->log[p->toggle];
-	if(p->nlog+nslot > nelem(p->log[0])) {
-		if(!flushlog(p))
-			return false;
-		log = p->log[p->toggle];
-	}
-
-	q = log+p->nlog;
-	*q++ = e->count;
-	*q++ = d;
-	for(i=0; i<d; i++)
-		*q++ = e->stack[i];
-	p->nlog = q - log;
-	e->count = 0;
-	return true;
-}
-
-// flushlog tries to flush the current log and switch to the other one.
-// flushlog is called from evict, called from add, called from the signal handler,
-// so it cannot allocate memory or block. It can try to swap logs with
-// the writing goroutine, as explained in the comment at the top of this file.
-static bool
-flushlog(Profile *p)
-{
-	uintptr *log, *q;
-
-	if(!runtime_cas(&p->handoff, 0, p->nlog))
-		return false;
-	runtime_notewakeup(&p->wait);
-
-	p->toggle = 1 - p->toggle;
-	log = p->log[p->toggle];
-	q = log;
-	if(p->lost > 0) {
-		*q++ = p->lost;
-		*q++ = 1;
-		*q++ = (uintptr)LostProfileData;
-		p->lost = 0;
-	}
-	p->nlog = q - log;
-	return true;
-}
-
-// getprofile blocks until the next block of profiling data is available
-// and returns it as a []byte. It is called from the writing goroutine.
-Slice
-getprofile(Profile *p)
-{
-	uint32 i, j, n;
-	Slice ret;
-	Bucket *b;
-	Entry *e;
-
-	ret.array = nil;
-	ret.len = 0;
-	ret.cap = 0;
-
-	if(p == nil)
-		return ret;
-
-	if(p->wholding) {
-		// Release previous log to signal handling side.
-		// Loop because we are racing against SetCPUProfileRate(0).
-		for(;;) {
-			n = p->handoff;
-			if(n == 0) {
-				runtime_printf("runtime: phase error during cpu profile handoff\n");
-				return ret;
-			}
-			if(n & 0x80000000) {
-				p->wtoggle = 1 - p->wtoggle;
-				p->wholding = false;
-				p->flushing = true;
-				goto flush;
-			}
-			if(runtime_cas(&p->handoff, n, 0))
-				break;
-		}
-		p->wtoggle = 1 - p->wtoggle;
-		p->wholding = false;
-	}
-
-	if(p->flushing)
-		goto flush;
-
-	if(!p->on && p->handoff == 0)
-		return ret;
-
-	// Wait for new log.
-	runtime_notetsleepg(&p->wait, -1);
-	runtime_noteclear(&p->wait);
-
-	n = p->handoff;
-	if(n == 0) {
-		runtime_printf("runtime: phase error during cpu profile wait\n");
-		return ret;
-	}
-	if(n == 0x80000000) {
-		p->flushing = true;
-		goto flush;
-	}
-	n &= ~0x80000000;
-
-	// Return new log to caller.
-	p->wholding = true;
-
-	ret.array = (byte*)p->log[p->wtoggle];
-	ret.len = n*sizeof(uintptr);
-	ret.cap = ret.len;
-	return ret;
-
-flush:
-	// In flush mode.
-	// Add is no longer being called. We own the log.
-	// Also, p->handoff is non-zero, so flushlog will return false.
-	// Evict the hash table into the log and return it.
-	for(i=0; i<HashSize; i++) {
-		b = &p->hash[i];
-		for(j=0; j<Assoc; j++) {
-			e = &b->entry[j];
-			if(e->count > 0 && !evict(p, e)) {
-				// Filled the log. Stop the loop and return what we've got.
-				goto breakflush;
-			}
-		}
-	}
-breakflush:
-
-	// Return pending log data.
-	if(p->nlog > 0) {
-		// Note that we're using toggle now, not wtoggle,
-		// because we're working on the log directly.
-		ret.array = (byte*)p->log[p->toggle];
-		ret.len = p->nlog*sizeof(uintptr);
-		ret.cap = ret.len;
-		p->nlog = 0;
-		return ret;
-	}
-
-	// Made it through the table without finding anything to log.
-	if(!p->eod_sent) {
-		// We may not have space to append this to the partial log buf,
-		// so we always return a new slice for the end-of-data marker.
-		p->eod_sent = true;
-		ret.array = (byte*)eod;
-		ret.len = sizeof eod;
-		ret.cap = ret.len;
-		return ret;
-	}
-
-	// Finally done. Clean up and return nil.
-	p->flushing = false;
-	if(!runtime_cas(&p->handoff, p->handoff, 0))
-		runtime_printf("runtime: profile flush racing with something\n");
-	return ret; // set to nil at top of function
-}
-
-// CPUProfile returns the next cpu profile block as a []byte.
-// The user documentation is in debug.go.
-func CPUProfile() (ret Slice) {
-	ret = getprofile(prof);
-}
diff --git a/libgo/runtime/go-signal.c b/libgo/runtime/go-signal.c
index 0aef2fc9b08..99829eb6385 100644
--- a/libgo/runtime/go-signal.c
+++ b/libgo/runtime/go-signal.c
@@ -156,6 +156,8 @@ runtime_sighandler (int sig, Siginfo *info,
 #ifdef SIGPROF
   if (sig == SIGPROF)
     {
+      /* FIXME: Handle m == NULL by calling something like gc's
+	 sigprofNonGo.
*/ if (m != NULL && gp != m->g0 && gp != m->gsignal) runtime_sigprof (); return; diff --git a/libgo/runtime/malloc.h b/libgo/runtime/malloc.h index 5e74b8c1f1a..12a25b57677 100644 --- a/libgo/runtime/malloc.h +++ b/libgo/runtime/malloc.h @@ -184,7 +184,8 @@ enum // SysFault marks a (already SysAlloc'd) region to fault // if accessed. Used only for debugging the runtime. -void* runtime_SysAlloc(uintptr nbytes, uint64 *stat); +void* runtime_SysAlloc(uintptr nbytes, uint64 *stat) + __asm__ (GOSYM_PREFIX "runtime.sysAlloc"); void runtime_SysFree(void *v, uintptr nbytes, uint64 *stat); void runtime_SysUnused(void *v, uintptr nbytes); void runtime_SysUsed(void *v, uintptr nbytes); diff --git a/libgo/runtime/proc.c b/libgo/runtime/proc.c index d8a26fd77ad..246ab7d1b02 100644 --- a/libgo/runtime/proc.c +++ b/libgo/runtime/proc.c @@ -2686,11 +2686,8 @@ runtime_mcount(void) } static struct { - Lock; - void (*fn)(uintptr*, int32); + uint32 lock; int32 hz; - uintptr pcbuf[TracebackMaxFrames]; - Location locbuf[TracebackMaxFrames]; } prof; static void System(void) {} @@ -2703,8 +2700,11 @@ runtime_sigprof() M *mp = g->m; int32 n, i; bool traceback; + uintptr pcbuf[TracebackMaxFrames]; + Location locbuf[TracebackMaxFrames]; + Slice stk; - if(prof.fn == nil || prof.hz == 0) + if(prof.hz == 0) return; if(mp == nil) @@ -2718,12 +2718,6 @@ runtime_sigprof() if(mp->mcache == nil) traceback = false; - runtime_lock(&prof); - if(prof.fn == nil) { - runtime_unlock(&prof); - mp->mallocing--; - return; - } n = 0; if(runtime_atomicload(&runtime_in_callers) > 0) { @@ -2735,34 +2729,44 @@ runtime_sigprof() } if(traceback) { - n = runtime_callers(0, prof.locbuf, nelem(prof.locbuf), false); + n = runtime_callers(0, locbuf, nelem(locbuf), false); for(i = 0; i < n; i++) - prof.pcbuf[i] = prof.locbuf[i].pc; + pcbuf[i] = locbuf[i].pc; } if(!traceback || n <= 0) { n = 2; - prof.pcbuf[0] = (uintptr)runtime_getcallerpc(&n); + pcbuf[0] = (uintptr)runtime_getcallerpc(&n); if(mp->gcing || mp->helpgc) - prof.pcbuf[1] = (uintptr)GC; + pcbuf[1] = (uintptr)GC; else - prof.pcbuf[1] = (uintptr)System; + pcbuf[1] = (uintptr)System; + } + + if (prof.hz != 0) { + stk.__values = &pcbuf[0]; + stk.__count = n; + stk.__capacity = n; + + // Simple cas-lock to coordinate with setcpuprofilerate. + while (!runtime_cas(&prof.lock, 0, 1)) { + runtime_osyield(); + } + if (prof.hz != 0) { + runtime_cpuprofAdd(stk); + } + runtime_atomicstore(&prof.lock, 0); } - prof.fn(prof.pcbuf, n); - runtime_unlock(&prof); + mp->mallocing--; } // Arrange to call fn with a traceback hz times a second. void -runtime_setcpuprofilerate(void (*fn)(uintptr*, int32), int32 hz) +runtime_setcpuprofilerate_m(int32 hz) { // Force sane arguments. if(hz < 0) hz = 0; - if(hz == 0) - fn = nil; - if(fn == nil) - hz = 0; // Disable preemption, otherwise we can be rescheduled to another thread // that has profiling enabled. @@ -2773,10 +2777,12 @@ runtime_setcpuprofilerate(void (*fn)(uintptr*, int32), int32 hz) // it would deadlock. 
runtime_resetcpuprofiler(0); - runtime_lock(&prof); - prof.fn = fn; + while (!runtime_cas(&prof.lock, 0, 1)) { + runtime_osyield(); + } prof.hz = hz; - runtime_unlock(&prof); + runtime_atomicstore(&prof.lock, 0); + runtime_lock(&runtime_sched); runtime_sched.profilehz = hz; runtime_unlock(&runtime_sched); diff --git a/libgo/runtime/runtime.h b/libgo/runtime/runtime.h index b7e59021e40..96f550ced0b 100644 --- a/libgo/runtime/runtime.h +++ b/libgo/runtime/runtime.h @@ -417,7 +417,10 @@ void runtime_freezetheworld(void); void runtime_unwindstack(G*, byte*); void runtime_sigprof(); void runtime_resetcpuprofiler(int32); -void runtime_setcpuprofilerate(void(*)(uintptr*, int32), int32); +void runtime_setcpuprofilerate_m(int32) + __asm__ (GOSYM_PREFIX "runtime.setcpuprofilerate_m"); +void runtime_cpuprofAdd(Slice) + __asm__ (GOSYM_PREFIX "runtime.cpuprofAdd"); void runtime_usleep(uint32) __asm__ (GOSYM_PREFIX "runtime.usleep"); int64 runtime_cputicks(void) diff --git a/libgo/runtime/runtime1.goc b/libgo/runtime/runtime1.goc index 2238980e8c3..bc5ba4ad539 100644 --- a/libgo/runtime/runtime1.goc +++ b/libgo/runtime/runtime1.goc @@ -55,10 +55,6 @@ func getgoroot() (out String) { out = runtime_getenv("GOROOT"); } -func runtime_pprof.runtime_cyclesPerSecond() (res int64) { - res = runtime_tickspersecond(); -} - func sync.runtime_procPin() (p int) { M *mp; -- 2.30.2
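
A note on the front-end change above: the new cpuprof.go passes a method value,
p.flushlog, as a plain func() bool argument to addWithFlushlog, and that
bound-method closure must not be heap-allocated when the runtime package itself
is being compiled. The sketch below, which uses hypothetical names outside the
runtime, only illustrates the shape of the construct that
Bound_method_expression::do_flatten now handles; it is not code from this patch.

    package main

    import "fmt"

    type profile struct {
        nlog int
    }

    // flushlog stands in for cpuProfile.flushlog: a method that will be
    // bound to a receiver and passed around as a plain function value.
    func (p *profile) flushlog() bool {
        p.nlog = 0
        return true
    }

    // addWithFlush mirrors the shape of addWithFlushlog: the flush
    // behavior arrives as an ordinary func() bool value.
    func addWithFlush(n int, flush func() bool) {
        if n > 0 && !flush() {
            fmt.Println("lost sample")
        }
    }

    func main() {
        p := &profile{nlog: 42}
        // p.flushlog is a method value: a closure binding p to flushlog.
        // Inside the real runtime package such a closure must not escape
        // to the heap, which is what the do_flatten change arranges.
        addWithFlush(1, p.flushlog)
        fmt.Println(p.nlog) // prints 0
    }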
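For context on the consumer side of getprofile: the writing goroutine described
in the cpuprof.go header comment drains runtime.CPUProfile until it returns nil.
The loop below is a minimal sketch based only on the documented CPUProfile
contract above (real clients should use runtime/pprof, as the doc comment says);
the file name and timings are illustrative.

    package main

    import (
        "io"
        "os"
        "runtime"
        "time"
    )

    // drainProfile copies binary profile chunks to w until CPUProfile
    // signals end of data by returning nil.
    func drainProfile(w io.Writer) error {
        for {
            data := runtime.CPUProfile()
            if data == nil {
                return nil
            }
            if _, err := w.Write(data); err != nil {
                return err
            }
        }
    }

    func main() {
        f, err := os.Create("cpu.prof")
        if err != nil {
            panic(err)
        }
        defer f.Close()

        done := make(chan error, 1)
        runtime.SetCPUProfileRate(100)
        go func() { done <- drainProfile(f) }() // the writer goroutine

        time.Sleep(time.Second) // profiled work would go here

        runtime.SetCPUProfileRate(0) // flush; CPUProfile eventually returns nil
        if err := <-done; err != nil {
            panic(err)
        }
    }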