Ticket #3553 (closed bug: fixed)

Opened 4 years ago

Last modified 3 years ago

parallel gc suffers badly if one thread is descheduled

Reported by: simonmar Owned by: simonmar
Priority: normal Milestone: 6.12.2
Component: Runtime System Version: 6.10.4
Keywords: Cc: sveina@…
Operating System: Unknown/Multiple Architecture: Unknown/Multiple
Type of failure: None/Unknown Difficulty: Unknown
Test Case: Blocked By:
Blocking: Related Tickets:

Description

The parallel GC synchronisation uses pure spinlocks, which leads to a severe decline in performance when one thread is descheduled. This is the main cause of the "last core parallel slowdown": using a -N value that matches the number of cores in the machine can be slower than using one fewer. The effect seems to be quite bad on Linux, reports are that it is less of an issue on OS X.

Switching to mutexes would help, but it isn't easy because we sometimes unlock these from a different thread than they were locked from, and standard mutexes don't let you do that (the locks in question are mut_spin and gc_spin in the GcThread structure).

Attachments

futex-spinlocks.patch.bz2 Download (207.0 KB) - added by simonmar 3 years ago.

Change History

Changed 4 years ago by simonmar

I tried using condition variables, but it was significantly slower than spinlocks. Experimental patch attached (or would be, if it wasn't so large).

Tue Oct  6 16:32:58 BST 2009  Simon Marlow <marlowsd@gmail.com>
  * EXPERIMENT: use condition variables instead of spinlocks for the GC barrier
  This was significantly slower, averaging +20% with 7 cores on
  nofib/parallel.
    {
    hunk ./rts/sm/GC.c 132
    +#if defined(THREADED_RTS)
    +Mutex     gc_threads_ready_lock;
    +Condition gc_threads_ready_cond;
    +nat       gc_threads_ready;
    +
    +Mutex     gc_threads_go_lock;
    +Condition gc_threads_go_cond;
    +rtsBool   gc_threads_go;
    +#endif
    +
    hunk ./rts/sm/GC.c 885
    -    initSpinLock(&t->gc_spin);
    -    initSpinLock(&t->mut_spin);
    -    ACQUIRE_SPIN_LOCK(&t->gc_spin);
    hunk ./rts/sm/GC.c 937
    +
    +        initMutex(&gc_threads_ready_lock);
    +        initCondition(&gc_threads_ready_cond);
    +        gc_threads_ready = 0;
    +        initMutex(&gc_threads_go_lock);
    +        initCondition(&gc_threads_go_cond);
    +        gc_threads_go = rtsFalse;
    hunk ./rts/sm/GC.c 1094
    -    // Wait until we're told to wake up
    -    RELEASE_SPIN_LOCK(&gct->mut_spin);
    hunk ./rts/sm/GC.c 1095
    +
    +    // Wait until we're told to wake up
    +    ACQUIRE_LOCK(&gc_threads_ready_lock);
    +    gc_threads_ready++;
    +    debugTrace(DEBUG_gc, "GC thread %d: gc_threads_ready = %d", gct->thread_index, gc_threads_ready);
    +    if (gc_threads_ready >= RtsFlags.ParFlags.nNodes-1) {
    +        signalCondition(&gc_threads_ready_cond);
    +    }
    +    RELEASE_LOCK(&gc_threads_ready_lock);
    +
    hunk ./rts/sm/GC.c 1106
    -    ACQUIRE_SPIN_LOCK(&gct->gc_spin);
    +
    +    ACQUIRE_LOCK(&gc_threads_go_lock);
    +    while (!gc_threads_go) {
    +        waitCondition(&gc_threads_go_cond, &gc_threads_go_lock);
    +    }
    +    RELEASE_LOCK(&gc_threads_go_lock);
    hunk ./rts/sm/GC.c 1134
    -    // Wait until we're told to continue
    -    RELEASE_SPIN_LOCK(&gct->gc_spin);
    hunk ./rts/sm/GC.c 1135
    +
    +    // Wait until we're told to continue
    +    ACQUIRE_LOCK(&gc_threads_ready_lock);
    +    gc_threads_ready++;
    +    if (gc_threads_ready >= RtsFlags.ParFlags.nNodes-1) {
    +        signalCondition(&gc_threads_ready_cond);
    +    }
    +    RELEASE_LOCK(&gc_threads_ready_lock);
    +
    hunk ./rts/sm/GC.c 1146
    -    ACQUIRE_SPIN_LOCK(&gct->mut_spin);
    +
    +    ACQUIRE_LOCK(&gc_threads_go_lock);
    +    while (gc_threads_go) {
    +        waitCondition(&gc_threads_go_cond, &gc_threads_go_lock);
    +    }
    +    RELEASE_LOCK(&gc_threads_go_lock);
    +
    hunk ./rts/sm/GC.c 1155
    +    gct->wakeup = GC_THREAD_INACTIVE;
    +
    hunk ./rts/sm/GC.c 1169
    -    nat i, j;
    +    nat i;
    hunk ./rts/sm/GC.c 1172
    -    while(retry) {
    -        for (i=0; i < n_threads; i++) {
    -            if (i == me) continue;
    -            if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
    -                prodCapability(&capabilities[i], cap->running_task);
    -            }
    -        }
    -        for (j=0; j < 10000000; j++) {
    -            retry = rtsFalse;
    -            for (i=0; i < n_threads; i++) {
    -                if (i == me) continue;
    -                write_barrier();
    -                setContextSwitches();
    -                if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
    -                    retry = rtsTrue;
    -                }
    -            }
    -            if (!retry) break;
    +    for (i=0; i < n_threads; i++) {
    +        if (i == me) continue;
    +        if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
    +            prodCapability(&capabilities[i], cap->running_task);
    hunk ./rts/sm/GC.c 1178
    +    setContextSwitches();
    +    $
    +    gc_threads_go = rtsFalse;
    +    $
    +    ACQUIRE_LOCK(&gc_threads_ready_lock);
    +    while (gc_threads_ready < n_threads-1) {
    +        debugTrace(DEBUG_gc, "waitForGcThreads: gc_threads_ready = %d", gc_threads_ready);
    +        waitCondition(&gc_threads_ready_cond, &gc_threads_ready_lock);
    +    } $
    +    gc_threads_ready = 0;
    +    RELEASE_LOCK(&gc_threads_ready_lock);
    hunk ./rts/sm/GC.c 1213
    -        ACQUIRE_SPIN_LOCK(&gc_threads[i]->mut_spin);
    -        RELEASE_SPIN_LOCK(&gc_threads[i]->gc_spin);
    +//        ACQUIRE_SPIN_LOCK(&gc_threads[i]->mut_spin);
    +//        RELEASE_SPIN_LOCK(&gc_threads[i]->gc_spin);
    hunk ./rts/sm/GC.c 1216
    +
    +    ACQUIRE_LOCK(&gc_threads_go_lock);
    +    gc_threads_go = rtsTrue;
    +    broadcastCondition(&gc_threads_go_cond);
    +    RELEASE_LOCK(&gc_threads_go_lock);
    hunk ./rts/sm/GC.c 1231
    -    nat i;
    -    for (i=0; i < n_threads; i++) {
    -        if (i == me) continue;
    -        while (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE) { write_barrier(); }
    +    ACQUIRE_LOCK(&gc_threads_ready_lock);
    +    while (gc_threads_ready < n_threads-1) {
    +        debugTrace(DEBUG_gc, "shutdown_gc_threads: %d", gc_threads_ready);
    +        waitCondition(&gc_threads_ready_cond, &gc_threads_ready_lock);
    hunk ./rts/sm/GC.c 1236
    +    gc_threads_ready = 0;
    +    RELEASE_LOCK(&gc_threads_ready_lock);
    hunk ./rts/sm/GC.c 1243
    -releaseGCThreads (Capability *cap USED_IF_THREADS)
    +releaseGCThreads (Capability *cap STG_UNUSED)
    hunk ./rts/sm/GC.c 1245
    -    nat n_threads = RtsFlags.ParFlags.nNodes;
    -    nat me = cap->no;
    -    nat i;
    -    for (i=0; i < n_threads; i++) {
    -        if (i == me) continue;
    -        if (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE) $
    -            barf("releaseGCThreads");
    -        $
    -        gc_threads[i]->wakeup = GC_THREAD_INACTIVE;
    -        ACQUIRE_SPIN_LOCK(&gc_threads[i]->gc_spin);
    -        RELEASE_SPIN_LOCK(&gc_threads[i]->mut_spin);
    -    }
    +    debugTrace(DEBUG_gc, "releaseGCThreads");
    +    ACQUIRE_LOCK(&gc_threads_go_lock);
    +    gc_threads_go = rtsFalse;
    +    broadcastCondition(&gc_threads_go_cond);
    +    RELEASE_LOCK(&gc_threads_go_lock);
    hunk ./rts/sm/GCThread.h 119
    -    SpinLock   gc_spin;
    -    SpinLock   mut_spin;
    }

Changed 3 years ago by simonmar

  • owner set to igloo
  • failure set to None/Unknown
  • type changed from bug to merge

This patch helps a lot:

Fri Jan 22 16:49:11 GMT 2010  Simon Marlow <marlowsd@gmail.com>
  * When acquiring a spinlock, yieldThread() every 1000 spins (#3553, #3758)
  
  This helps when the thread holding the lock has been descheduled,
  which is the main cause of the "last-core slowdown" problem.  With
  this patch, I get much better results with -N8 on an 8-core box,
  although some benchmarks are still worse than with 7 cores.
  
  I also added a yieldThread() into the any_work() loop of the parallel
  GC when it has no work to do. Oddly, this seems to improve performance
  on the parallel GC benchmarks even when all the cores are busy.
  Perhaps it is due to reducing contention on the memory bus.

See  this blog post for more info.

I'm going to close the bug, since I think this is the best we can do until we implement on-the-fly minor GCs.

Changed 3 years ago by wuzzeb

I would be careful about adding sched_yield. Ingo Molnar, the developer who wrote and maintains the linux scheduler, says he has never seen a valid use of sched_yield. See  http://kerneltrap.org/Linux/Using_sched_yield_Improperly for the discussion. At least with the CFS scheduler in linux, using sched_yield might not always do what you expect.

Also, from your blog post at least, it seems like futexes are perfect for what you need. In the uncontended case, they are completely in userspace. They can be acquired on one thread and released on another thread. Any reasons why futexes don't work?

 http://people.redhat.com/drepper/futex.pdf  http://en.wikipedia.org/wiki/Futex

Changed 3 years ago by simonmar

I tried using futexes today, and so far haven't managed to beat the performance of the sched_yield spinlocks, but I might not be using them right. I fully expect futexes to be better. (also, futexes are not exactly a real API. The syscall isn't even exposed by glibc, you have to define it manually, as I discovered with some help from folks on #ghc).

I can entirely believe that sched_yield is never the absolute right thing, but it might be the best portable solution we have.

Changed 3 years ago by simonmar

Changed 3 years ago by simonmar

Patch to use futexes attached. This is significantly slower than the sched_yield version currently in use. I don't know why - as far as I can tell I'm using futexes correctly. The protocol I'm using is from Drepper's paper, and I tried it with and without some user-space spinning in the acquire case.

nofib/parallel/ray on 8 cores, first with futexes and then with yield:

$ ./ray 1000 +RTS -N8 -s >/dev/null
  14,784,695,584 bytes allocated in the heap
     246,403,264 bytes copied during GC
         108,232 bytes maximum residency (169 sample(s))
         310,000 bytes maximum slop
               6 MB total memory in use (0 MB lost due to fragmentation)

  Generation 0:  4606 collections,  4605 parallel,  4.31s,  2.16s elapsed
  Generation 1:   169 collections,   169 parallel,  0.22s,  0.08s elapsed

  Parallel GC work balance: 1.56 (30214391 / 19430130, ideal 8)

  SPARKS: 1000000 (978174 converted, 21636 pruned)

  INIT  time    0.00s  (  0.00s elapsed)
  MUT   time    8.96s  (  1.86s elapsed)
  GC    time    4.53s  (  2.24s elapsed)
  EXIT  time    0.00s  (  0.00s elapsed)
  Total time   13.49s  (  4.11s elapsed)


$ ./ray-yield 1000 +RTS -N8 -s >/dev/null 
  14,834,802,304 bytes allocated in the heap
     237,105,080 bytes copied during GC
          97,736 bytes maximum residency (158 sample(s))
         299,160 bytes maximum slop
               6 MB total memory in use (0 MB lost due to fragmentation)

  Generation 0:  4515 collections,  4514 parallel,  7.73s,  1.65s elapsed
  Generation 1:   158 collections,   158 parallel,  0.39s,  0.06s elapsed

  Parallel GC work balance: 1.53 (29092959 / 18954020, ideal 8)

  SPARKS: 1000000 (980491 converted, 19356 pruned)

  INIT  time    0.00s  (  0.00s elapsed)
  MUT   time   10.74s  (  1.93s elapsed)
  GC    time    8.11s  (  1.71s elapsed)
  EXIT  time    0.00s  (  0.00s elapsed)
  Total time   18.86s  (  3.64s elapsed)

The extra CPU time in the yield version is due to the higher spin threshold, but I've tried different spin thresholds in the futex case and it didn't help.

Changed 3 years ago by igloo

  • owner changed from igloo to simonmar
  • type changed from merge to bug

Patch merged. Simon, I'm bouncing it back to you in case you want it left open for the futex investigation.

Changed 3 years ago by simonmar

  • status changed from new to closed
  • resolution set to fixed

I'm not going to investigate this any more.

Changed 3 years ago by wuzzeb

I know you said you weren't going to investigate further and the ticket is closed, so I am just adding a pointer to some info for future investigation if someone decides to look into it again (I might myself if I become more familiar with ghc internals).

There is a recent patch to the linux kernel to implement a futux which spins in the kernel, see  http://thread.gmane.org/gmane.linux.kernel/970412

You can spin in the kernel until the lock is released or you are scheduled out, might be closer to what ghc wants.

Changed 3 years ago by Baughn

  • cc sveina@… added
Note: See TracTickets for help on using tickets.