runtime: move schedt type and sched var from C to Go
author Ian Lance Taylor <ian@gcc.gnu.org>
Fri, 18 Nov 2016 17:48:29 +0000 (17:48 +0000)
committer Ian Lance Taylor <ian@gcc.gnu.org>
Fri, 18 Nov 2016 17:48:29 +0000 (17:48 +0000)
    This doesn't change any actual code; it just starts using the Go
    definition of the schedt type and the sched variable rather than the
    C definitions.

    The schedt type is tweaked slightly for gccgo.  We aren't going to
    release goroutine stacks, so we don't need separate gfreeStack and
    gfreeNoStack lists.  We only have one size of defer function, so we
    don't need a list of 5 different pools.
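
    The C side reaches the Go-defined variable through a tiny accessor
    exported by symbol name.  As a rough sketch of the convention this
    patch relies on (the names below are illustrative, not part of the
    patch itself), the Go half looks like:

        package runtime

        // state stands in for a variable, like sched, that used to be
        // defined in C but is now owned by the Go code.
        type state struct {
                lock mutex // the runtime's own low-level mutex type
                n    int32
        }

        var theState state

        // The go:linkname pragma pins the symbol name, so the C code
        // can declare the accessor as
        //
        //   extern state *runtime_getstate(void)
        //     __asm__ (GOSYM_PREFIX "runtime.getstate");
        //
        // call it once during startup, and cache the returned pointer
        // for every later field access.
        //go:linkname getstate runtime.getstate
        func getstate() *state {
                return &theState
        }

    Caching the pointer once keeps the hot scheduler paths free of
    repeated cross-language calls; the proc.c hunk below does exactly
    that with runtime_sched = runtime_getsched() in runtime_schedinit.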

    Reviewed-on: https://go-review.googlesource.com/33364

From-SVN: r242600

gcc/go/gofrontend/MERGE
libgo/go/runtime/runtime2.go
libgo/go/runtime/stubs.go
libgo/runtime/proc.c

index 6e29f40af134006d83f9143683d94c3c8e98cd51..f0cbae6c3c2d66e159754d490c05e7c929a777f8 100644 (file)
@@ -1,4 +1,4 @@
-fc4ca600b2fc6de81fd3c4014542d6a50593db1a
+bf4762823c4543229867436399be3ae30b4d13bb
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
index f3bbb589d1c986d03971f379aaba75e78a7b0e4d..7a0cc4316e4358027dc3607ed4305f7515eab89d 100644 (file)
@@ -550,9 +550,6 @@ const (
        _MaxGomaxprocs = 1 << 8
 )
 
-/*
-Commented out for gccgo for now.
-
 type schedt struct {
        // accessed atomically. keep at top to ensure alignment on 32-bit systems.
        goidgen  uint64
@@ -578,18 +575,17 @@ type schedt struct {
        runqsize int32
 
        // Global cache of dead G's.
-       gflock       mutex
-       gfreeStack   *g
-       gfreeNoStack *g
-       ngfree       int32
+       gflock mutex
+       gfree  *g
+       ngfree int32
 
        // Central cache of sudog structs.
        sudoglock  mutex
        sudogcache *sudog
 
-       // Central pool of available defer structs of different sizes.
+       // Central pool of available defer structs.
        deferlock mutex
-       deferpool [5]*_defer
+       deferpool *_defer
 
        gcwaiting  uint32 // gc is waiting to run
        stopwait   int32
@@ -608,7 +604,6 @@ type schedt struct {
        procresizetime int64 // nanotime() of last change to gomaxprocs
        totaltime      int64 // ∫gomaxprocs dt up to procresizetime
 }
-*/
 
 // The m.locked word holds two pieces of state counting active calls to LockOSThread/lockOSThread.
 // The low bit (LockExternal) is a boolean reporting whether any LockOSThread call is active.
@@ -772,8 +767,10 @@ var (
 
        ncpu int32
 
-//     forcegc     forcegcstate
-//     sched       schedt
+       //      forcegc     forcegcstate
+
+       sched schedt
+
 //     newprocs    int32
 
 // Information about what cpu features are available.
index 8d90cd6061f5ff1d49edf8af6283585d4c8414d5..b01bc7c315315f3b5d483f131e3d17b924cd6d65 100644 (file)
@@ -520,3 +520,9 @@ func dumpregs(*_siginfo_t, unsafe.Pointer)
 
 // Temporary for gccgo until we port panic.go.
 func startpanic()
+
+// Temporary for gccgo until we port proc.go.
+//go:linkname getsched runtime.getsched
+func getsched() *schedt {
+       return &sched
+}
index 43ced39c0f76b08252c3b10743b3dbbb2a7b3be3..d8834ad485f98db33d372f0a6e26350ff43421b1 100644 (file)
@@ -351,48 +351,18 @@ runtime_mcall(void (*pfn)(G*))
 //
 // Design doc at http://golang.org/s/go11sched.
 
-typedef struct Sched Sched;
-struct Sched {
-       Lock;
-
-       uint64  goidgen;
-       M*      midle;   // idle m's waiting for work
-       int32   nmidle;  // number of idle m's waiting for work
-       int32   nmidlelocked; // number of locked m's waiting for work
-       int32   mcount;  // number of m's that have been created
-       int32   maxmcount;      // maximum number of m's allowed (or die)
-
-       P*      pidle;  // idle P's
-       uint32  npidle;
-       uint32  nmspinning;
-
-       // Global runnable queue.
-       G*      runqhead;
-       G*      runqtail;
-       int32   runqsize;
-
-       // Global cache of dead G's.
-       Lock    gflock;
-       G*      gfree;
-
-       uint32  gcwaiting;      // gc is waiting to run
-       int32   stopwait;
-       Note    stopnote;
-       uint32  sysmonwait;
-       Note    sysmonnote;
-       uint64  lastpoll;
-
-       int32   profilehz;      // cpu profiling rate
-};
+typedef struct schedt Sched;
 
 enum
 {
-       // Number of goroutine ids to grab from runtime_sched.goidgen to local per-P cache at once.
+       // Number of goroutine ids to grab from runtime_sched->goidgen to local per-P cache at once.
        // 16 seems to provide enough amortization, but other than that it's mostly arbitrary number.
        GoidCacheBatch = 16,
 };
 
-Sched  runtime_sched;
+extern Sched* runtime_getsched() __asm__ (GOSYM_PREFIX "runtime.getsched");
+
+static Sched*  runtime_sched;
 int32  runtime_gomaxprocs;
 uint32 runtime_needextram = 1;
 M      runtime_m0;
@@ -471,6 +441,8 @@ runtime_schedinit(void)
        const byte *p;
        Eface i;
 
+       runtime_sched = runtime_getsched();
+
        m = &runtime_m0;
        g = &runtime_g0;
        m->g0 = g;
@@ -479,7 +451,7 @@ runtime_schedinit(void)
 
        initcontext();
 
-       runtime_sched.maxmcount = 10000;
+       runtime_sched->maxmcount = 10000;
        runtime_precisestack = 0;
 
        // runtime_symtabinit();
@@ -500,7 +472,7 @@ runtime_schedinit(void)
        runtime_goenvs();
        runtime_parsedebugvars();
 
-       runtime_sched.lastpoll = runtime_nanotime();
+       runtime_sched->lastpoll = runtime_nanotime();
        procs = 1;
        s = runtime_getenv("GOMAXPROCS");
        p = s.str;
@@ -747,8 +719,8 @@ static void
 checkmcount(void)
 {
        // sched lock is held
-       if(runtime_sched.mcount > runtime_sched.maxmcount) {
-               runtime_printf("runtime: program exceeds %d-thread limit\n", runtime_sched.maxmcount);
+       if(runtime_sched->mcount > runtime_sched->maxmcount) {
+               runtime_printf("runtime: program exceeds %d-thread limit\n", runtime_sched->maxmcount);
                runtime_throw("thread exhaustion");
        }
 }
@@ -782,8 +754,8 @@ mcommoninit(M *mp)
 
        mp->fastrand = 0x49f6428aUL + mp->id + runtime_cputicks();
 
-       runtime_lock(&runtime_sched);
-       mp->id = runtime_sched.mcount++;
+       runtime_lock(&runtime_sched->lock);
+       mp->id = runtime_sched->mcount++;
        checkmcount();
        runtime_mpreinit(mp);
 
@@ -793,7 +765,7 @@ mcommoninit(M *mp)
        // runtime_NumCgoCall() iterates over allm w/o schedlock,
        // so we need to publish it safely.
        runtime_atomicstorep(&runtime_allm, mp);
-       runtime_unlock(&runtime_sched);
+       runtime_unlock(&runtime_sched->lock);
 }
 
 // Mark gp ready to run.
@@ -808,7 +780,7 @@ runtime_ready(G *gp)
        }
        gp->atomicstatus = _Grunnable;
        runqput((P*)g->m->p, gp);
-       if(runtime_atomicload(&runtime_sched.npidle) != 0 && runtime_atomicload(&runtime_sched.nmspinning) == 0)  // TODO: fast atomic
+       if(runtime_atomicload(&runtime_sched->npidle) != 0 && runtime_atomicload(&runtime_sched->nmspinning) == 0)  // TODO: fast atomic
                wakep();
        g->m->locks--;
 }
@@ -828,15 +800,15 @@ runtime_gcprocs(void)
 
        // Figure out how many CPUs to use during GC.
        // Limited by gomaxprocs, number of actual CPUs, and MaxGcproc.
-       runtime_lock(&runtime_sched);
+       runtime_lock(&runtime_sched->lock);
        n = runtime_gomaxprocs;
        if(n > runtime_ncpu)
                n = runtime_ncpu > 0 ? runtime_ncpu : 1;
        if(n > MaxGcproc)
                n = MaxGcproc;
-       if(n > runtime_sched.nmidle+1) // one M is currently running
-               n = runtime_sched.nmidle+1;
-       runtime_unlock(&runtime_sched);
+       if(n > runtime_sched->nmidle+1) // one M is currently running
+               n = runtime_sched->nmidle+1;
+       runtime_unlock(&runtime_sched->lock);
        return n;
 }
 
@@ -845,14 +817,14 @@ needaddgcproc(void)
 {
        int32 n;
 
-       runtime_lock(&runtime_sched);
+       runtime_lock(&runtime_sched->lock);
        n = runtime_gomaxprocs;
        if(n > runtime_ncpu)
                n = runtime_ncpu;
        if(n > MaxGcproc)
                n = MaxGcproc;
-       n -= runtime_sched.nmidle+1; // one M is currently running
-       runtime_unlock(&runtime_sched);
+       n -= runtime_sched->nmidle+1; // one M is currently running
+       runtime_unlock(&runtime_sched->lock);
        return n > 0;
 }
 
@@ -862,7 +834,7 @@ runtime_helpgc(int32 nproc)
        M *mp;
        int32 n, pos;
 
-       runtime_lock(&runtime_sched);
+       runtime_lock(&runtime_sched->lock);
        pos = 0;
        for(n = 1; n < nproc; n++) {  // one M is currently running
                if(runtime_allp[pos]->mcache == g->m->mcache)
@@ -875,7 +847,7 @@ runtime_helpgc(int32 nproc)
                pos++;
                runtime_notewakeup(&mp->park);
        }
-       runtime_unlock(&runtime_sched);
+       runtime_unlock(&runtime_sched->lock);
 }
 
 // Similar to stoptheworld but best-effort and can be called several times.
@@ -893,8 +865,8 @@ runtime_freezetheworld(void)
        // so try several times
        for(i = 0; i < 5; i++) {
                // this should tell the scheduler to not start any new goroutines
-               runtime_sched.stopwait = 0x7fffffff;
-               runtime_atomicstore((uint32*)&runtime_sched.gcwaiting, 1);
+               runtime_sched->stopwait = 0x7fffffff;
+               runtime_atomicstore((uint32*)&runtime_sched->gcwaiting, 1);
                // this should stop running goroutines
                if(!preemptall())
                        break;  // no running goroutines
@@ -914,34 +886,34 @@ runtime_stopTheWorldWithSema(void)
        P *p;
        bool wait;
 
-       runtime_lock(&runtime_sched);
-       runtime_sched.stopwait = runtime_gomaxprocs;
-       runtime_atomicstore((uint32*)&runtime_sched.gcwaiting, 1);
+       runtime_lock(&runtime_sched->lock);
+       runtime_sched->stopwait = runtime_gomaxprocs;
+       runtime_atomicstore((uint32*)&runtime_sched->gcwaiting, 1);
        preemptall();
        // stop current P
        ((P*)g->m->p)->status = _Pgcstop;
-       runtime_sched.stopwait--;
+       runtime_sched->stopwait--;
        // try to retake all P's in _Psyscall status
        for(i = 0; i < runtime_gomaxprocs; i++) {
                p = runtime_allp[i];
                s = p->status;
                if(s == _Psyscall && runtime_cas(&p->status, s, _Pgcstop))
-                       runtime_sched.stopwait--;
+                       runtime_sched->stopwait--;
        }
        // stop idle P's
        while((p = pidleget()) != nil) {
                p->status = _Pgcstop;
-               runtime_sched.stopwait--;
+               runtime_sched->stopwait--;
        }
-       wait = runtime_sched.stopwait > 0;
-       runtime_unlock(&runtime_sched);
+       wait = runtime_sched->stopwait > 0;
+       runtime_unlock(&runtime_sched->lock);
 
        // wait for remaining P's to stop voluntarily
        if(wait) {
-               runtime_notesleep(&runtime_sched.stopnote);
-               runtime_noteclear(&runtime_sched.stopnote);
+               runtime_notesleep(&runtime_sched->stopnote);
+               runtime_noteclear(&runtime_sched->stopnote);
        }
-       if(runtime_sched.stopwait)
+       if(runtime_sched->stopwait)
                runtime_throw("stoptheworld: not stopped");
        for(i = 0; i < runtime_gomaxprocs; i++) {
                p = runtime_allp[i];
@@ -968,13 +940,13 @@ runtime_startTheWorldWithSema(void)
        gp = runtime_netpoll(false);  // non-blocking
        injectglist(gp);
        add = needaddgcproc();
-       runtime_lock(&runtime_sched);
+       runtime_lock(&runtime_sched->lock);
        if(newprocs) {
                procresize(newprocs);
                newprocs = 0;
        } else
                procresize(runtime_gomaxprocs);
-       runtime_sched.gcwaiting = 0;
+       runtime_sched->gcwaiting = 0;
 
        p1 = nil;
        while((p = pidleget()) != nil) {
@@ -988,11 +960,11 @@ runtime_startTheWorldWithSema(void)
                p->link = (uintptr)p1;
                p1 = p;
        }
-       if(runtime_sched.sysmonwait) {
-               runtime_sched.sysmonwait = false;
-               runtime_notewakeup(&runtime_sched.sysmonnote);
+       if(runtime_sched->sysmonwait) {
+               runtime_sched->sysmonwait = false;
+               runtime_notewakeup(&runtime_sched->sysmonnote);
        }
-       runtime_unlock(&runtime_sched);
+       runtime_unlock(&runtime_sched->lock);
 
        while(p1) {
                p = p1;
@@ -1285,7 +1257,7 @@ runtime_newextram(void)
        mp->locked = _LockInternal;
        mp->lockedg = gp;
        gp->lockedm = mp;
-       gp->goid = runtime_xadd64(&runtime_sched.goidgen, 1);
+       gp->goid = runtime_xadd64(&runtime_sched->goidgen, 1);
        // put on allg for garbage collector
        allgadd(gp);
 
@@ -1439,13 +1411,13 @@ stopm(void)
                runtime_throw("stopm holding p");
        if(m->spinning) {
                m->spinning = false;
-               runtime_xadd(&runtime_sched.nmspinning, -1);
+               runtime_xadd(&runtime_sched->nmspinning, -1);
        }
 
 retry:
-       runtime_lock(&runtime_sched);
+       runtime_lock(&runtime_sched->lock);
        mput(m);
-       runtime_unlock(&runtime_sched);
+       runtime_unlock(&runtime_sched->lock);
        runtime_notesleep(&m->park);
        m = g->m;
        runtime_noteclear(&m->park);
@@ -1473,18 +1445,18 @@ startm(P *p, bool spinning)
        M *mp;
        void (*fn)(void);
 
-       runtime_lock(&runtime_sched);
+       runtime_lock(&runtime_sched->lock);
        if(p == nil) {
                p = pidleget();
                if(p == nil) {
-                       runtime_unlock(&runtime_sched);
+                       runtime_unlock(&runtime_sched->lock);
                        if(spinning)
-                               runtime_xadd(&runtime_sched.nmspinning, -1);
+                               runtime_xadd(&runtime_sched->nmspinning, -1);
                        return;
                }
        }
        mp = mget();
-       runtime_unlock(&runtime_sched);
+       runtime_unlock(&runtime_sched->lock);
        if(mp == nil) {
                fn = nil;
                if(spinning)
@@ -1506,39 +1478,39 @@ static void
 handoffp(P *p)
 {
        // if it has local work, start it straight away
-       if(p->runqhead != p->runqtail || runtime_sched.runqsize) {
+       if(p->runqhead != p->runqtail || runtime_sched->runqsize) {
                startm(p, false);
                return;
        }
        // no local work, check that there are no spinning/idle M's,
        // otherwise our help is not required
-       if(runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) == 0 &&  // TODO: fast atomic
-               runtime_cas(&runtime_sched.nmspinning, 0, 1)) {
+       if(runtime_atomicload(&runtime_sched->nmspinning) + runtime_atomicload(&runtime_sched->npidle) == 0 &&  // TODO: fast atomic
+               runtime_cas(&runtime_sched->nmspinning, 0, 1)) {
                startm(p, true);
                return;
        }
-       runtime_lock(&runtime_sched);
-       if(runtime_sched.gcwaiting) {
+       runtime_lock(&runtime_sched->lock);
+       if(runtime_sched->gcwaiting) {
                p->status = _Pgcstop;
-               if(--runtime_sched.stopwait == 0)
-                       runtime_notewakeup(&runtime_sched.stopnote);
-               runtime_unlock(&runtime_sched);
+               if(--runtime_sched->stopwait == 0)
+                       runtime_notewakeup(&runtime_sched->stopnote);
+               runtime_unlock(&runtime_sched->lock);
                return;
        }
-       if(runtime_sched.runqsize) {
-               runtime_unlock(&runtime_sched);
+       if(runtime_sched->runqsize) {
+               runtime_unlock(&runtime_sched->lock);
                startm(p, false);
                return;
        }
        // If this is the last running P and nobody is polling network,
        // need to wakeup another M to poll network.
-       if(runtime_sched.npidle == (uint32)runtime_gomaxprocs-1 && runtime_atomicload64(&runtime_sched.lastpoll) != 0) {
-               runtime_unlock(&runtime_sched);
+       if(runtime_sched->npidle == (uint32)runtime_gomaxprocs-1 && runtime_atomicload64(&runtime_sched->lastpoll) != 0) {
+               runtime_unlock(&runtime_sched->lock);
                startm(p, false);
                return;
        }
        pidleput(p);
-       runtime_unlock(&runtime_sched);
+       runtime_unlock(&runtime_sched->lock);
 }
 
 // Tries to add one more P to execute G's.
@@ -1547,7 +1519,7 @@ static void
 wakep(void)
 {
        // be conservative about spinning threads
-       if(!runtime_cas(&runtime_sched.nmspinning, 0, 1))
+       if(!runtime_cas(&runtime_sched->nmspinning, 0, 1))
                return;
        startm(nil, true);
 }
@@ -1606,18 +1578,18 @@ gcstopm(void)
 {
        P *p;
 
-       if(!runtime_sched.gcwaiting)
+       if(!runtime_sched->gcwaiting)
                runtime_throw("gcstopm: not waiting for gc");
        if(g->m->spinning) {
                g->m->spinning = false;
-               runtime_xadd(&runtime_sched.nmspinning, -1);
+               runtime_xadd(&runtime_sched->nmspinning, -1);
        }
        p = releasep();
-       runtime_lock(&runtime_sched);
+       runtime_lock(&runtime_sched->lock);
        p->status = _Pgcstop;
-       if(--runtime_sched.stopwait == 0)
-               runtime_notewakeup(&runtime_sched.stopnote);
-       runtime_unlock(&runtime_sched);
+       if(--runtime_sched->stopwait == 0)
+               runtime_notewakeup(&runtime_sched->stopnote);
+       runtime_unlock(&runtime_sched->lock);
        stopm();
 }
 
@@ -1639,7 +1611,7 @@ execute(G *gp)
        gp->m = g->m;
 
        // Check whether the profiler needs to be turned on or off.
-       hz = runtime_sched.profilehz;
+       hz = runtime_sched->profilehz;
        if(g->m->profilehz != hz)
                runtime_resetcpuprofiler(hz);
 
@@ -1656,7 +1628,7 @@ findrunnable(void)
        int32 i;
 
 top:
-       if(runtime_sched.gcwaiting) {
+       if(runtime_sched->gcwaiting) {
                gcstopm();
                goto top;
        }
@@ -1667,10 +1639,10 @@ top:
        if(gp)
                return gp;
        // global runq
-       if(runtime_sched.runqsize) {
-               runtime_lock(&runtime_sched);
+       if(runtime_sched->runqsize) {
+               runtime_lock(&runtime_sched->lock);
                gp = globrunqget((P*)g->m->p, 0);
-               runtime_unlock(&runtime_sched);
+               runtime_unlock(&runtime_sched->lock);
                if(gp)
                        return gp;
        }
@@ -1684,15 +1656,15 @@ top:
        // If number of spinning M's >= number of busy P's, block.
        // This is necessary to prevent excessive CPU consumption
        // when GOMAXPROCS>>1 but the program parallelism is low.
-       if(!g->m->spinning && 2 * runtime_atomicload(&runtime_sched.nmspinning) >= runtime_gomaxprocs - runtime_atomicload(&runtime_sched.npidle))  // TODO: fast atomic
+       if(!g->m->spinning && 2 * runtime_atomicload(&runtime_sched->nmspinning) >= runtime_gomaxprocs - runtime_atomicload(&runtime_sched->npidle))  // TODO: fast atomic
                goto stop;
        if(!g->m->spinning) {
                g->m->spinning = true;
-               runtime_xadd(&runtime_sched.nmspinning, 1);
+               runtime_xadd(&runtime_sched->nmspinning, 1);
        }
        // random steal from other P's
        for(i = 0; i < 2*runtime_gomaxprocs; i++) {
-               if(runtime_sched.gcwaiting)
+               if(runtime_sched->gcwaiting)
                        goto top;
                p = runtime_allp[runtime_fastrand1()%runtime_gomaxprocs];
                if(p == (P*)g->m->p)
@@ -1704,30 +1676,30 @@ top:
        }
 stop:
        // return P and block
-       runtime_lock(&runtime_sched);
-       if(runtime_sched.gcwaiting) {
-               runtime_unlock(&runtime_sched);
+       runtime_lock(&runtime_sched->lock);
+       if(runtime_sched->gcwaiting) {
+               runtime_unlock(&runtime_sched->lock);
                goto top;
        }
-       if(runtime_sched.runqsize) {
+       if(runtime_sched->runqsize) {
                gp = globrunqget((P*)g->m->p, 0);
-               runtime_unlock(&runtime_sched);
+               runtime_unlock(&runtime_sched->lock);
                return gp;
        }
        p = releasep();
        pidleput(p);
-       runtime_unlock(&runtime_sched);
+       runtime_unlock(&runtime_sched->lock);
        if(g->m->spinning) {
                g->m->spinning = false;
-               runtime_xadd(&runtime_sched.nmspinning, -1);
+               runtime_xadd(&runtime_sched->nmspinning, -1);
        }
        // check all runqueues once again
        for(i = 0; i < runtime_gomaxprocs; i++) {
                p = runtime_allp[i];
                if(p && p->runqhead != p->runqtail) {
-                       runtime_lock(&runtime_sched);
+                       runtime_lock(&runtime_sched->lock);
                        p = pidleget();
-                       runtime_unlock(&runtime_sched);
+                       runtime_unlock(&runtime_sched->lock);
                        if(p) {
                                acquirep(p);
                                goto top;
@@ -1736,17 +1708,17 @@ stop:
                }
        }
        // poll network
-       if(runtime_xchg64(&runtime_sched.lastpoll, 0) != 0) {
+       if(runtime_xchg64(&runtime_sched->lastpoll, 0) != 0) {
                if(g->m->p)
                        runtime_throw("findrunnable: netpoll with p");
                if(g->m->spinning)
                        runtime_throw("findrunnable: netpoll with spinning");
                gp = runtime_netpoll(true);  // block until new work is available
-               runtime_atomicstore64(&runtime_sched.lastpoll, runtime_nanotime());
+               runtime_atomicstore64(&runtime_sched->lastpoll, runtime_nanotime());
                if(gp) {
-                       runtime_lock(&runtime_sched);
+                       runtime_lock(&runtime_sched->lock);
                        p = pidleget();
-                       runtime_unlock(&runtime_sched);
+                       runtime_unlock(&runtime_sched->lock);
                        if(p) {
                                acquirep(p);
                                injectglist((G*)gp->schedlink);
@@ -1767,15 +1739,15 @@ resetspinning(void)
 
        if(g->m->spinning) {
                g->m->spinning = false;
-               nmspinning = runtime_xadd(&runtime_sched.nmspinning, -1);
+               nmspinning = runtime_xadd(&runtime_sched->nmspinning, -1);
                if(nmspinning < 0)
                        runtime_throw("findrunnable: negative nmspinning");
        } else
-               nmspinning = runtime_atomicload(&runtime_sched.nmspinning);
+               nmspinning = runtime_atomicload(&runtime_sched->nmspinning);
 
        // M wakeup policy is deliberately somewhat conservative (see nmspinning handling),
        // so see if we need to wakeup another P here.
-       if (nmspinning == 0 && runtime_atomicload(&runtime_sched.npidle) > 0)
+       if (nmspinning == 0 && runtime_atomicload(&runtime_sched->npidle) > 0)
                wakep();
 }
 
@@ -1789,16 +1761,16 @@ injectglist(G *glist)
 
        if(glist == nil)
                return;
-       runtime_lock(&runtime_sched);
+       runtime_lock(&runtime_sched->lock);
        for(n = 0; glist; n++) {
                gp = glist;
                glist = (G*)gp->schedlink;
                gp->atomicstatus = _Grunnable;
                globrunqput(gp);
        }
-       runtime_unlock(&runtime_sched);
+       runtime_unlock(&runtime_sched->lock);
 
-       for(; n && runtime_sched.npidle; n--)
+       for(; n && runtime_sched->npidle; n--)
                startm(nil, false);
 }
 
@@ -1814,7 +1786,7 @@ schedule(void)
                runtime_throw("schedule: holding locks");
 
 top:
-       if(runtime_sched.gcwaiting) {
+       if(runtime_sched->gcwaiting) {
                gcstopm();
                goto top;
        }
@@ -1826,10 +1798,10 @@ top:
        tick = ((P*)g->m->p)->schedtick;
        // This is a fancy way to say tick%61==0,
        // it uses 2 MUL instructions instead of a single DIV and so is faster on modern processors.
-       if(tick - (((uint64)tick*0x4325c53fu)>>36)*61 == 0 && runtime_sched.runqsize > 0) {
-               runtime_lock(&runtime_sched);
+       if(tick - (((uint64)tick*0x4325c53fu)>>36)*61 == 0 && runtime_sched->runqsize > 0) {
+               runtime_lock(&runtime_sched->lock);
                gp = globrunqget((P*)g->m->p, 1);
-               runtime_unlock(&runtime_sched);
+               runtime_unlock(&runtime_sched->lock);
                if(gp)
                        resetspinning();
        }
@@ -1959,9 +1931,9 @@ runtime_gosched0(G *gp)
        gp->atomicstatus = _Grunnable;
        gp->m = nil;
        m->curg = nil;
-       runtime_lock(&runtime_sched);
+       runtime_lock(&runtime_sched->lock);
        globrunqput(gp);
-       runtime_unlock(&runtime_sched);
+       runtime_unlock(&runtime_sched->lock);
        if(m->lockedg) {
                stoplockedm();
                execute(gp);  // Never returns.
@@ -2074,25 +2046,25 @@ doentersyscall(uintptr pc, uintptr sp)
 
        g->atomicstatus = _Gsyscall;
 
-       if(runtime_atomicload(&runtime_sched.sysmonwait)) {  // TODO: fast atomic
-               runtime_lock(&runtime_sched);
-               if(runtime_atomicload(&runtime_sched.sysmonwait)) {
-                       runtime_atomicstore(&runtime_sched.sysmonwait, 0);
-                       runtime_notewakeup(&runtime_sched.sysmonnote);
+       if(runtime_atomicload(&runtime_sched->sysmonwait)) {  // TODO: fast atomic
+               runtime_lock(&runtime_sched->lock);
+               if(runtime_atomicload(&runtime_sched->sysmonwait)) {
+                       runtime_atomicstore(&runtime_sched->sysmonwait, 0);
+                       runtime_notewakeup(&runtime_sched->sysmonnote);
                }
-               runtime_unlock(&runtime_sched);
+               runtime_unlock(&runtime_sched->lock);
        }
 
        g->m->mcache = nil;
        ((P*)(g->m->p))->m = 0;
        runtime_atomicstore(&((P*)g->m->p)->status, _Psyscall);
-       if(runtime_atomicload(&runtime_sched.gcwaiting)) {
-               runtime_lock(&runtime_sched);
-               if (runtime_sched.stopwait > 0 && runtime_cas(&((P*)g->m->p)->status, _Psyscall, _Pgcstop)) {
-                       if(--runtime_sched.stopwait == 0)
-                               runtime_notewakeup(&runtime_sched.stopnote);
+       if(runtime_atomicload(&runtime_sched->gcwaiting)) {
+               runtime_lock(&runtime_sched->lock);
+               if (runtime_sched->stopwait > 0 && runtime_cas(&((P*)g->m->p)->status, _Psyscall, _Pgcstop)) {
+                       if(--runtime_sched->stopwait == 0)
+                               runtime_notewakeup(&runtime_sched->stopnote);
                }
-               runtime_unlock(&runtime_sched);
+               runtime_unlock(&runtime_sched->lock);
        }
 
        g->m->locks--;
@@ -2201,7 +2173,7 @@ exitsyscallfast(void)
        gp = g;
 
        // Freezetheworld sets stopwait but does not retake P's.
-       if(runtime_sched.stopwait) {
+       if(runtime_sched->stopwait) {
                gp->m->p = 0;
                return false;
        }
@@ -2215,14 +2187,14 @@ exitsyscallfast(void)
        }
        // Try to get any other idle P.
        gp->m->p = 0;
-       if(runtime_sched.pidle) {
-               runtime_lock(&runtime_sched);
+       if(runtime_sched->pidle) {
+               runtime_lock(&runtime_sched->lock);
                p = pidleget();
-               if(p && runtime_atomicload(&runtime_sched.sysmonwait)) {
-                       runtime_atomicstore(&runtime_sched.sysmonwait, 0);
-                       runtime_notewakeup(&runtime_sched.sysmonnote);
+               if(p && runtime_atomicload(&runtime_sched->sysmonwait)) {
+                       runtime_atomicstore(&runtime_sched->sysmonwait, 0);
+                       runtime_notewakeup(&runtime_sched->sysmonnote);
                }
-               runtime_unlock(&runtime_sched);
+               runtime_unlock(&runtime_sched->lock);
                if(p) {
                        acquirep(p);
                        return true;
@@ -2243,15 +2215,15 @@ exitsyscall0(G *gp)
        gp->atomicstatus = _Grunnable;
        gp->m = nil;
        m->curg = nil;
-       runtime_lock(&runtime_sched);
+       runtime_lock(&runtime_sched->lock);
        p = pidleget();
        if(p == nil)
                globrunqput(gp);
-       else if(runtime_atomicload(&runtime_sched.sysmonwait)) {
-               runtime_atomicstore(&runtime_sched.sysmonwait, 0);
-               runtime_notewakeup(&runtime_sched.sysmonnote);
+       else if(runtime_atomicload(&runtime_sched->sysmonwait)) {
+               runtime_atomicstore(&runtime_sched->sysmonwait, 0);
+               runtime_notewakeup(&runtime_sched->sysmonnote);
        }
-       runtime_unlock(&runtime_sched);
+       runtime_unlock(&runtime_sched->lock);
        if(p) {
                acquirep(p);
                execute(gp);  // Never returns.
@@ -2308,7 +2280,7 @@ syscall_runtime_AfterFork(void)
 {
        int32 hz;
 
-       hz = runtime_sched.profilehz;
+       hz = runtime_sched->profilehz;
        if(hz != 0)
                runtime_resetcpuprofiler(hz);
        runtime_m()->locks--;
@@ -2400,7 +2372,7 @@ __go_go(void (*fn)(void*), void* arg)
        newg->gopc = (uintptr)__builtin_return_address(0);
        newg->atomicstatus = _Grunnable;
        if(p->goidcache == p->goidcacheend) {
-               p->goidcache = runtime_xadd64(&runtime_sched.goidgen, GoidCacheBatch);
+               p->goidcache = runtime_xadd64(&runtime_sched->goidgen, GoidCacheBatch);
                p->goidcacheend = p->goidcache + GoidCacheBatch;
        }
        newg->goid = p->goidcache++;
@@ -2421,7 +2393,7 @@ __go_go(void (*fn)(void*), void* arg)
 
                runqput(p, vnewg);
 
-               if(runtime_atomicload(&runtime_sched.npidle) != 0 && runtime_atomicload(&runtime_sched.nmspinning) == 0 && fn != runtime_main)  // TODO: fast atomic
+               if(runtime_atomicload(&runtime_sched->npidle) != 0 && runtime_atomicload(&runtime_sched->nmspinning) == 0 && fn != runtime_main)  // TODO: fast atomic
                        wakep();
                g->m->locks--;
                return vnewg;
@@ -2462,15 +2434,15 @@ gfput(P *p, G *gp)
        p->gfree = gp;
        p->gfreecnt++;
        if(p->gfreecnt >= 64) {
-               runtime_lock(&runtime_sched.gflock);
+               runtime_lock(&runtime_sched->gflock);
                while(p->gfreecnt >= 32) {
                        p->gfreecnt--;
                        gp = p->gfree;
                        p->gfree = (G*)gp->schedlink;
-                       gp->schedlink = (uintptr)runtime_sched.gfree;
-                       runtime_sched.gfree = gp;
+                       gp->schedlink = (uintptr)runtime_sched->gfree;
+                       runtime_sched->gfree = gp;
                }
-               runtime_unlock(&runtime_sched.gflock);
+               runtime_unlock(&runtime_sched->gflock);
        }
 }
 
@@ -2483,16 +2455,16 @@ gfget(P *p)
 
 retry:
        gp = p->gfree;
-       if(gp == nil && runtime_sched.gfree) {
-               runtime_lock(&runtime_sched.gflock);
-               while(p->gfreecnt < 32 && runtime_sched.gfree) {
+       if(gp == nil && runtime_sched->gfree) {
+               runtime_lock(&runtime_sched->gflock);
+               while(p->gfreecnt < 32 && runtime_sched->gfree) {
                        p->gfreecnt++;
-                       gp = runtime_sched.gfree;
-                       runtime_sched.gfree = (G*)gp->schedlink;
+                       gp = runtime_sched->gfree;
+                       runtime_sched->gfree = (G*)gp->schedlink;
                        gp->schedlink = (uintptr)p->gfree;
                        p->gfree = gp;
                }
-               runtime_unlock(&runtime_sched.gflock);
+               runtime_unlock(&runtime_sched->gflock);
                goto retry;
        }
        if(gp) {
@@ -2508,15 +2480,15 @@ gfpurge(P *p)
 {
        G *gp;
 
-       runtime_lock(&runtime_sched.gflock);
+       runtime_lock(&runtime_sched->gflock);
        while(p->gfreecnt) {
                p->gfreecnt--;
                gp = p->gfree;
                p->gfree = (G*)gp->schedlink;
-               gp->schedlink = (uintptr)runtime_sched.gfree;
-               runtime_sched.gfree = gp;
+               gp->schedlink = (uintptr)runtime_sched->gfree;
+               runtime_sched->gfree = gp;
        }
-       runtime_unlock(&runtime_sched.gflock);
+       runtime_unlock(&runtime_sched->gflock);
 }
 
 void
@@ -2546,13 +2518,13 @@ runtime_GOMAXPROCS(intgo n)
 
        if(n > _MaxGomaxprocs)
                n = _MaxGomaxprocs;
-       runtime_lock(&runtime_sched);
+       runtime_lock(&runtime_sched->lock);
        ret = (intgo)runtime_gomaxprocs;
        if(n <= 0 || n == ret) {
-               runtime_unlock(&runtime_sched);
+               runtime_unlock(&runtime_sched->lock);
                return ret;
        }
-       runtime_unlock(&runtime_sched);
+       runtime_unlock(&runtime_sched->lock);
 
        runtime_acquireWorldsema();
        g->m->gcing = 1;
@@ -2653,7 +2625,7 @@ runtime_gcount(void)
 int32
 runtime_mcount(void)
 {
-       return runtime_sched.mcount;
+       return runtime_sched->mcount;
 }
 
 static struct {
@@ -2754,9 +2726,9 @@ runtime_setcpuprofilerate_m(int32 hz)
        prof.hz = hz;
        runtime_atomicstore(&prof.lock, 0);
 
-       runtime_lock(&runtime_sched);
-       runtime_sched.profilehz = hz;
-       runtime_unlock(&runtime_sched);
+       runtime_lock(&runtime_sched->lock);
+       runtime_sched->profilehz = hz;
+       runtime_unlock(&runtime_sched->lock);
 
        if(hz != 0)
                runtime_resetcpuprofiler(hz);
@@ -2809,22 +2781,22 @@ procresize(int32 new)
                        p->runqtail--;
                        gp = (G*)p->runq[p->runqtail%nelem(p->runq)];
                        // push onto head of global queue
-                       gp->schedlink = (uintptr)runtime_sched.runqhead;
-                       runtime_sched.runqhead = gp;
-                       if(runtime_sched.runqtail == nil)
-                               runtime_sched.runqtail = gp;
-                       runtime_sched.runqsize++;
+                       gp->schedlink = runtime_sched->runqhead;
+                       runtime_sched->runqhead = (uintptr)gp;
+                       if(runtime_sched->runqtail == 0)
+                               runtime_sched->runqtail = (uintptr)gp;
+                       runtime_sched->runqsize++;
                }
        }
        // fill local queues with at most nelem(p->runq)/2 goroutines
        // start at 1 because current M already executes some G and will acquire allp[0] below,
        // so if we have a spare G we want to put it into allp[1].
-       for(i = 1; (uint32)i < (uint32)new * nelem(p->runq)/2 && runtime_sched.runqsize > 0; i++) {
-               gp = runtime_sched.runqhead;
-               runtime_sched.runqhead = (G*)gp->schedlink;
-               if(runtime_sched.runqhead == nil)
-                       runtime_sched.runqtail = nil;
-               runtime_sched.runqsize--;
+       for(i = 1; (uint32)i < (uint32)new * nelem(p->runq)/2 && runtime_sched->runqsize > 0; i++) {
+               gp = (G*)runtime_sched->runqhead;
+               runtime_sched->runqhead = gp->schedlink;
+               if(runtime_sched->runqhead == 0)
+                       runtime_sched->runqtail = 0;
+               runtime_sched->runqsize--;
                runqput(runtime_allp[i%new], gp);
        }
 
@@ -2899,11 +2871,11 @@ releasep(void)
 static void
 incidlelocked(int32 v)
 {
-       runtime_lock(&runtime_sched);
-       runtime_sched.nmidlelocked += v;
+       runtime_lock(&runtime_sched->lock);
+       runtime_sched->nmidlelocked += v;
        if(v > 0)
                checkdead();
-       runtime_unlock(&runtime_sched);
+       runtime_unlock(&runtime_sched->lock);
 }
 
 // Check for deadlock situation.
@@ -2923,7 +2895,7 @@ checkdead(void)
        }
 
        // -1 for sysmon
-       run = runtime_sched.mcount - runtime_sched.nmidle - runtime_sched.nmidlelocked - 1 - countextra();
+       run = runtime_sched->mcount - runtime_sched->nmidle - runtime_sched->nmidlelocked - 1 - countextra();
        if(run > 0)
                return;
        // If we are dying because of a signal caught on an already idle thread,
@@ -2934,7 +2906,7 @@ checkdead(void)
                return;
        if(run < 0) {
                runtime_printf("runtime: checkdead: nmidle=%d nmidlelocked=%d mcount=%d\n",
-                       runtime_sched.nmidle, runtime_sched.nmidlelocked, runtime_sched.mcount);
+                       runtime_sched->nmidle, runtime_sched->nmidlelocked, runtime_sched->mcount);
                runtime_throw("checkdead: inconsistent counts");
        }
        grunning = 0;
@@ -2978,23 +2950,23 @@ sysmon(void)
                        delay = 10*1000;
                runtime_usleep(delay);
                if(runtime_debug.schedtrace <= 0 &&
-                       (runtime_sched.gcwaiting || runtime_atomicload(&runtime_sched.npidle) == (uint32)runtime_gomaxprocs)) {  // TODO: fast atomic
-                       runtime_lock(&runtime_sched);
-                       if(runtime_atomicload(&runtime_sched.gcwaiting) || runtime_atomicload(&runtime_sched.npidle) == (uint32)runtime_gomaxprocs) {
-                               runtime_atomicstore(&runtime_sched.sysmonwait, 1);
-                               runtime_unlock(&runtime_sched);
-                               runtime_notesleep(&runtime_sched.sysmonnote);
-                               runtime_noteclear(&runtime_sched.sysmonnote);
+                       (runtime_sched->gcwaiting || runtime_atomicload(&runtime_sched->npidle) == (uint32)runtime_gomaxprocs)) {  // TODO: fast atomic
+                       runtime_lock(&runtime_sched->lock);
+                       if(runtime_atomicload(&runtime_sched->gcwaiting) || runtime_atomicload(&runtime_sched->npidle) == (uint32)runtime_gomaxprocs) {
+                               runtime_atomicstore(&runtime_sched->sysmonwait, 1);
+                               runtime_unlock(&runtime_sched->lock);
+                               runtime_notesleep(&runtime_sched->sysmonnote);
+                               runtime_noteclear(&runtime_sched->sysmonnote);
                                idle = 0;
                                delay = 20;
                        } else
-                               runtime_unlock(&runtime_sched);
+                               runtime_unlock(&runtime_sched->lock);
                }
                // poll network if not polled for more than 10ms
-               lastpoll = runtime_atomicload64(&runtime_sched.lastpoll);
+               lastpoll = runtime_atomicload64(&runtime_sched->lastpoll);
                now = runtime_nanotime();
                if(lastpoll != 0 && lastpoll + 10*1000*1000 < now) {
-                       runtime_cas64(&runtime_sched.lastpoll, lastpoll, now);
+                       runtime_cas64(&runtime_sched->lastpoll, lastpoll, now);
                        gp = runtime_netpoll(false);  // non-blocking
                        if(gp) {
                                // Need to decrement number of idle locked M's
@@ -3060,7 +3032,7 @@ retake(int64 now)
                        // but on the other hand we want to retake them eventually
                        // because they can prevent the sysmon thread from deep sleep.
                        if(p->runqhead == p->runqtail &&
-                               runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) > 0 &&
+                               runtime_atomicload(&runtime_sched->nmspinning) + runtime_atomicload(&runtime_sched->npidle) > 0 &&
                                pd->syscallwhen + 10*1000*1000 > now)
                                continue;
                        // Need to decrement number of idle locked M's
@@ -3117,14 +3089,14 @@ runtime_schedtrace(bool detailed)
        if(starttime == 0)
                starttime = now;
 
-       runtime_lock(&runtime_sched);
+       runtime_lock(&runtime_sched->lock);
        runtime_printf("SCHED %Dms: gomaxprocs=%d idleprocs=%d threads=%d idlethreads=%d runqueue=%d",
-               (now-starttime)/1000000, runtime_gomaxprocs, runtime_sched.npidle, runtime_sched.mcount,
-               runtime_sched.nmidle, runtime_sched.runqsize);
+               (now-starttime)/1000000, runtime_gomaxprocs, runtime_sched->npidle, runtime_sched->mcount,
+               runtime_sched->nmidle, runtime_sched->runqsize);
        if(detailed) {
                runtime_printf(" gcwaiting=%d nmidlelocked=%d nmspinning=%d stopwait=%d sysmonwait=%d\n",
-                       runtime_sched.gcwaiting, runtime_sched.nmidlelocked, runtime_sched.nmspinning,
-                       runtime_sched.stopwait, runtime_sched.sysmonwait);
+                       runtime_sched->gcwaiting, runtime_sched->nmidlelocked, runtime_sched->nmspinning,
+                       runtime_sched->stopwait, runtime_sched->sysmonwait);
        }
        // We must be careful while reading data from P's, M's and G's.
        // Even if we hold schedlock, most data can be changed concurrently.
@@ -3153,7 +3125,7 @@ runtime_schedtrace(bool detailed)
                }
        }
        if(!detailed) {
-               runtime_unlock(&runtime_sched);
+               runtime_unlock(&runtime_sched->lock);
                return;
        }
        for(mp = runtime_allm; mp; mp = mp->alllink) {
@@ -3185,7 +3157,7 @@ runtime_schedtrace(bool detailed)
                        lockedm ? lockedm->id : -1);
        }
        runtime_unlock(&allglock);
-       runtime_unlock(&runtime_sched);
+       runtime_unlock(&runtime_sched->lock);
 }
 
 // Put mp on midle list.
@@ -3193,9 +3165,9 @@ runtime_schedtrace(bool detailed)
 static void
 mput(M *mp)
 {
-       mp->schedlink = (uintptr)runtime_sched.midle;
-       runtime_sched.midle = mp;
-       runtime_sched.nmidle++;
+       mp->schedlink = runtime_sched->midle;
+       runtime_sched->midle = (uintptr)mp;
+       runtime_sched->nmidle++;
        checkdead();
 }
 
@@ -3206,9 +3178,9 @@ mget(void)
 {
        M *mp;
 
-       if((mp = runtime_sched.midle) != nil){
-               runtime_sched.midle = (M*)mp->schedlink;
-               runtime_sched.nmidle--;
+       if((mp = (M*)runtime_sched->midle) != nil){
+               runtime_sched->midle = mp->schedlink;
+               runtime_sched->nmidle--;
        }
        return mp;
 }
@@ -3219,12 +3191,12 @@ static void
 globrunqput(G *gp)
 {
        gp->schedlink = 0;
-       if(runtime_sched.runqtail)
-               runtime_sched.runqtail->schedlink = (uintptr)gp;
+       if(runtime_sched->runqtail)
+               ((G*)runtime_sched->runqtail)->schedlink = (uintptr)gp;
        else
-               runtime_sched.runqhead = gp;
-       runtime_sched.runqtail = gp;
-       runtime_sched.runqsize++;
+               runtime_sched->runqhead = (uintptr)gp;
+       runtime_sched->runqtail = (uintptr)gp;
+       runtime_sched->runqsize++;
 }
 
 // Put a batch of runnable goroutines on the global runnable queue.
@@ -3233,12 +3205,12 @@ static void
 globrunqputbatch(G *ghead, G *gtail, int32 n)
 {
        gtail->schedlink = 0;
-       if(runtime_sched.runqtail)
-               runtime_sched.runqtail->schedlink = (uintptr)ghead;
+       if(runtime_sched->runqtail)
+               ((G*)runtime_sched->runqtail)->schedlink = (uintptr)ghead;
        else
-               runtime_sched.runqhead = ghead;
-       runtime_sched.runqtail = gtail;
-       runtime_sched.runqsize += n;
+               runtime_sched->runqhead = (uintptr)ghead;
+       runtime_sched->runqtail = (uintptr)gtail;
+       runtime_sched->runqsize += n;
 }
 
 // Try get a batch of G's from the global runnable queue.
@@ -3249,24 +3221,24 @@ globrunqget(P *p, int32 max)
        G *gp, *gp1;
        int32 n;
 
-       if(runtime_sched.runqsize == 0)
+       if(runtime_sched->runqsize == 0)
                return nil;
-       n = runtime_sched.runqsize/runtime_gomaxprocs+1;
-       if(n > runtime_sched.runqsize)
-               n = runtime_sched.runqsize;
+       n = runtime_sched->runqsize/runtime_gomaxprocs+1;
+       if(n > runtime_sched->runqsize)
+               n = runtime_sched->runqsize;
        if(max > 0 && n > max)
                n = max;
        if((uint32)n > nelem(p->runq)/2)
                n = nelem(p->runq)/2;
-       runtime_sched.runqsize -= n;
-       if(runtime_sched.runqsize == 0)
-               runtime_sched.runqtail = nil;
-       gp = runtime_sched.runqhead;
-       runtime_sched.runqhead = (G*)gp->schedlink;
+       runtime_sched->runqsize -= n;
+       if(runtime_sched->runqsize == 0)
+               runtime_sched->runqtail = 0;
+       gp = (G*)runtime_sched->runqhead;
+       runtime_sched->runqhead = gp->schedlink;
        n--;
        while(n--) {
-               gp1 = runtime_sched.runqhead;
-               runtime_sched.runqhead = (G*)gp1->schedlink;
+               gp1 = (G*)runtime_sched->runqhead;
+               runtime_sched->runqhead = gp1->schedlink;
                runqput(p, gp1);
        }
        return gp;
@@ -3277,9 +3249,9 @@ globrunqget(P *p, int32 max)
 static void
 pidleput(P *p)
 {
-       p->link = (uintptr)runtime_sched.pidle;
-       runtime_sched.pidle = p;
-       runtime_xadd(&runtime_sched.npidle, 1);  // TODO: fast atomic
+       p->link = runtime_sched->pidle;
+       runtime_sched->pidle = (uintptr)p;
+       runtime_xadd(&runtime_sched->npidle, 1);  // TODO: fast atomic
 }
 
 // Try get a p from pidle list.
@@ -3289,10 +3261,10 @@ pidleget(void)
 {
        P *p;
 
-       p = runtime_sched.pidle;
+       p = (P*)runtime_sched->pidle;
        if(p) {
-               runtime_sched.pidle = (P*)p->link;
-               runtime_xadd(&runtime_sched.npidle, -1);  // TODO: fast atomic
+               runtime_sched->pidle = p->link;
+               runtime_xadd(&runtime_sched->npidle, -1);  // TODO: fast atomic
        }
        return p;
 }
@@ -3341,9 +3313,9 @@ runqputslow(P *p, G *gp, uint32 h, uint32 t)
        for(i=0; i<n; i++)
                batch[i]->schedlink = (uintptr)batch[i+1];
        // Now put the batch on global queue.
-       runtime_lock(&runtime_sched);
+       runtime_lock(&runtime_sched->lock);
        globrunqputbatch(batch[0], batch[n], n+1);
-       runtime_unlock(&runtime_sched);
+       runtime_unlock(&runtime_sched->lock);
        return true;
 }
 
@@ -3495,11 +3467,11 @@ runtime_setmaxthreads(intgo in)
 {
        intgo out;
 
-       runtime_lock(&runtime_sched);
-       out = (intgo)runtime_sched.maxmcount;
-       runtime_sched.maxmcount = (int32)in;
+       runtime_lock(&runtime_sched->lock);
+       out = (intgo)runtime_sched->maxmcount;
+       runtime_sched->maxmcount = (int32)in;
        checkmcount();
-       runtime_unlock(&runtime_sched);
+       runtime_unlock(&runtime_sched->lock);
        return out;
 }
 
@@ -3558,7 +3530,6 @@ sync_atomic_runtime_procUnpin()
 void
 runtime_proc_scan(struct Workbuf** wbufp, void (*enqueue1)(struct Workbuf**, Obj))
 {
-       enqueue1(wbufp, (Obj){(byte*)&runtime_sched, sizeof runtime_sched, 0});
        enqueue1(wbufp, (Obj){(byte*)&runtime_main_init_done, sizeof runtime_main_init_done, 0});
 }
 
@@ -3567,7 +3538,7 @@ runtime_proc_scan(struct Workbuf** wbufp, void (*enqueue1)(struct Workbuf**, Obj
 bool
 runtime_gcwaiting(void)
 {
-       return runtime_sched.gcwaiting;
+       return runtime_sched->gcwaiting;
 }
 
 // os_beforeExit is called from os.Exit(0).
@@ -3602,7 +3573,7 @@ sync_runtime_canSpin(intgo i)
        // GOMAXPROCS>1 and there is at least one other running P and local runq is empty.
        // As opposed to runtime mutex we don't do passive spinning here,
        // because there can be work on global runq on on other Ps.
-       if (i >= ACTIVE_SPIN || runtime_ncpu <= 1 || runtime_gomaxprocs <= (int32)(runtime_sched.npidle+runtime_sched.nmspinning)+1) {
+       if (i >= ACTIVE_SPIN || runtime_ncpu <= 1 || runtime_gomaxprocs <= (int32)(runtime_sched->npidle+runtime_sched->nmspinning)+1) {
                return false;
        }
        p = (P*)g->m->p;