libgo patch committed: Update to current Go library

Message ID mcr3915q06b.fsf@google.com
State New

Commit Message

Ian Lance Taylor Oct. 23, 2012, 4:31 a.m. UTC
I have committed a patch to update the mainline version of libgo to the
current master Go library sources.  At this point I will only be
updating the gcc 4.7 branch for bug fixes.

This is a substantial patch that brings in several months of work.  As
usual I am not posting the complete patch here, since it consists mostly
of copies of changes from the upstream repository.  I have attached the
changes to gccgo-specific files and to files with many gccgo-specific
changes.

There is a decent chance that this will break something on non-x86
systems.  I will do what testing I am able to do after the commit.

Bootstrapped and ran Go testsuite on x86_64-unknown-linux-gnu.
Committed to mainline.

Ian

Patch

diff -r bf12a7f41b67 go/gogo.cc
--- a/go/gogo.cc	Sun Oct 07 21:29:09 2012 -0700
+++ b/go/gogo.cc	Mon Oct 22 17:36:23 2012 -0700
@@ -1251,6 +1251,7 @@ 
   this->package_->bindings()->clear_file_scope();
 
   // Warn about packages which were imported but not used.
+  bool quiet = saw_errors();
   for (Packages::iterator p = this->packages_.begin();
        p != this->packages_.end();
        ++p)
@@ -1260,7 +1261,7 @@ 
 	  && package->is_imported()
 	  && !package->used()
 	  && !package->uses_sink_alias()
-	  && !saw_errors())
+	  && !quiet)
 	error_at(package->location(), "imported and not used: %s",
 		 Gogo::message_name(package->package_name()).c_str());
       package->clear_is_imported();
diff -r bf12a7f41b67 go/runtime.cc
--- a/go/runtime.cc	Sun Oct 07 21:29:09 2012 -0700
+++ b/go/runtime.cc	Mon Oct 22 17:36:23 2012 -0700
@@ -32,6 +32,8 @@ 
   RFT_BOOLPTR,
   // Go type int, C type int.
   RFT_INT,
+  // Go type int32, C type int32_t.
+  RFT_INT32,
   // Go type int64, C type int64_t.
   RFT_INT64,
   // Go type uint64, C type uint64_t.
@@ -102,6 +104,10 @@ 
 	  t = Type::lookup_integer_type("int");
 	  break;
 
+	case RFT_INT32:
+	  t = Type::lookup_integer_type("int32");
+	  break;
+
 	case RFT_INT64:
 	  t = Type::lookup_integer_type("int64");
 	  break;
@@ -206,6 +212,7 @@ 
     case RFT_BOOL:
     case RFT_BOOLPTR:
     case RFT_INT:
+    case RFT_INT32:
     case RFT_INT64:
     case RFT_UINT64:
     case RFT_UINTPTR:
diff -r bf12a7f41b67 go/runtime.def
--- a/go/runtime.def	Sun Oct 07 21:29:09 2012 -0700
+++ b/go/runtime.def	Mon Oct 22 17:36:23 2012 -0700
@@ -148,27 +148,28 @@ 
 
 
 // Start building a select statement.
-DEF_GO_RUNTIME(NEWSELECT, "runtime.newselect", P1(INT), R1(POINTER))
+DEF_GO_RUNTIME(NEWSELECT, "runtime.newselect", P1(INT32), R1(POINTER))
 
 // Add a default clause to a select statement.
-DEF_GO_RUNTIME(SELECTDEFAULT, "runtime.selectdefault", P2(POINTER, INT), R0())
+DEF_GO_RUNTIME(SELECTDEFAULT, "runtime.selectdefault",
+	       P2(POINTER, INT32), R0())
 
 // Add a send clause to a select statement.
 DEF_GO_RUNTIME(SELECTSEND, "runtime.selectsend",
-	       P4(POINTER, CHAN, POINTER, INT), R0())
+	       P4(POINTER, CHAN, POINTER, INT32), R0())
 
 // Add a receive clause to a select statement, for a clause which does
 // not check whether the channel is closed.
 DEF_GO_RUNTIME(SELECTRECV, "runtime.selectrecv",
-	       P4(POINTER, CHAN, POINTER, INT), R0())
+	       P4(POINTER, CHAN, POINTER, INT32), R0())
 
 // Add a receive clause to a select statement, for a clause which does
 // check whether the channel is closed.
 DEF_GO_RUNTIME(SELECTRECV2, "runtime.selectrecv2",
-	       P5(POINTER, CHAN, POINTER, BOOLPTR, INT), R0())
+	       P5(POINTER, CHAN, POINTER, BOOLPTR, INT32), R0())
 
 // Run a select, returning the index of the selected clause.
-DEF_GO_RUNTIME(SELECTGO, "runtime.selectgo", P1(POINTER), R1(INT))
+DEF_GO_RUNTIME(SELECTGO, "runtime.selectgo", P1(POINTER), R1(INT32))
 
 
 // Panic.
diff -r bf12a7f41b67 go/statements.cc
--- a/go/statements.cc	Sun Oct 07 21:29:09 2012 -0700
+++ b/go/statements.cc	Mon Oct 22 17:36:23 2012 -0700
@@ -4841,6 +4841,8 @@ 
   std::vector<std::vector<Bexpression*> > cases(count);
   std::vector<Bstatement*> clauses(count);
 
+  Type* int32_type = Type::lookup_integer_type("int32");
+
   int i = 0;
   for (Clauses::iterator p = this->clauses_.begin();
        p != this->clauses_.end();
@@ -4849,7 +4851,8 @@ 
       int index = p->index();
       mpz_t ival;
       mpz_init_set_ui(ival, index);
-      Expression* index_expr = Expression::make_integer(&ival, NULL, location);
+      Expression* index_expr = Expression::make_integer(&ival, int32_type,
+							location);
       mpz_clear(ival);
       cases[i].push_back(tree_to_expr(index_expr->get_tree(context)));
 
diff -r bf12a7f41b67 libgo/MERGE
--- a/libgo/MERGE	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/MERGE	Mon Oct 22 17:36:23 2012 -0700
@@ -1,4 +1,4 @@ 
-2d8bc3c94ecb
+291d9f1baf75
 
 The first line of this file holds the Mercurial revision number of the
 last merge done from the master library sources.
diff -r bf12a7f41b67 libgo/Makefile.am
--- a/libgo/Makefile.am	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/Makefile.am	Mon Oct 22 17:36:23 2012 -0700
@@ -230,6 +230,21 @@ 
 	exp/types.gox \
 	exp/utf8string.gox
 
+toolexeclibgoexphtmldir = $(toolexeclibgoexpdir)/html
+
+toolexeclibgoexphtml_DATA = \
+	exp/html/atom.gox
+
+toolexeclibgoexplocaledir = $(toolexeclibgoexpdir)/locale
+
+toolexeclibgoexplocale_DATA = \
+	exp/locale/collate.gox
+
+toolexeclibgoexplocalecollatedir = $(toolexeclibgoexplocaledir)/collate
+
+toolexeclibgoexplocalecollate_DATA = \
+	exp/locale/collate/build.gox
+
 toolexeclibgogodir = $(toolexeclibgodir)/go
 
 toolexeclibgogo_DATA = \
@@ -483,6 +498,7 @@ 
 	runtime/go-unwind.c \
 	runtime/chan.c \
 	runtime/cpuprof.c \
+	runtime/lfstack.c \
 	$(runtime_lock_files) \
 	runtime/mcache.c \
 	runtime/mcentral.c \
@@ -492,6 +508,8 @@ 
 	runtime/mgc0.c \
 	runtime/mheap.c \
 	runtime/msize.c \
+	runtime/panic.c \
+	runtime/parfor.c \
 	runtime/print.c \
 	runtime/proc.c \
 	runtime/runtime.c \
@@ -656,16 +674,16 @@ 
 else # !LIBGO_IS_RTEMS
 if LIBGO_IS_LINUX
 go_net_fd_os_file = go/net/fd_linux.go
-go_net_newpollserver_file = go/net/newpollserver.go
+go_net_newpollserver_file = go/net/newpollserver_unix.go
 else # !LIBGO_IS_LINUX && !LIBGO_IS_RTEMS
 if LIBGO_IS_NETBSD
 go_net_fd_os_file = go/net/fd_netbsd.go
-go_net_newpollserver_file = go/net/newpollserver.go
+go_net_newpollserver_file = go/net/newpollserver_unix.go
 else # !LIBGO_IS_NETBSD && !LIBGO_IS_LINUX && !LIBGO_IS_RTEMS
 # By default use select with pipes.  Most systems should have
 # something better.
 go_net_fd_os_file = go/net/fd_select.go
-go_net_newpollserver_file = go/net/newpollserver.go
+go_net_newpollserver_file = go/net/newpollserver_unix.go
 endif # !LIBGO_IS_NETBSD
 endif # !LIBGO_IS_LINUX
 endif # !LIBGO_IS_RTEMS
@@ -674,13 +692,13 @@ 
 go_net_cgo_file = go/net/cgo_linux.go
 go_net_sock_file = go/net/sock_linux.go
 go_net_sockopt_file = go/net/sockopt_linux.go
-go_net_sockoptip_file = go/net/sockoptip_linux.go
+go_net_sockoptip_file = go/net/sockoptip_linux.go go/net/sockoptip_posix.go
 else
 if LIBGO_IS_IRIX
 go_net_cgo_file = go/net/cgo_linux.go
 go_net_sock_file = go/net/sock_linux.go
 go_net_sockopt_file = go/net/sockopt_linux.go
-go_net_sockoptip_file = go/net/sockoptip_linux.go
+go_net_sockoptip_file = go/net/sockoptip_linux.go go/net/sockoptip_posix.go
 else
 if LIBGO_IS_SOLARIS
 go_net_cgo_file = go/net/cgo_linux.go
@@ -692,12 +710,19 @@ 
 go_net_cgo_file = go/net/cgo_bsd.go
 go_net_sock_file = go/net/sock_bsd.go
 go_net_sockopt_file = go/net/sockopt_bsd.go
-go_net_sockoptip_file = go/net/sockoptip_bsd.go go/net/sockoptip_freebsd.go
+go_net_sockoptip_file = go/net/sockoptip_bsd.go go/net/sockoptip_posix.go
+else
+if LIBGO_IS_NETBSD
+go_net_cgo_file = go/net/cgo_netbsd.go
+go_net_sock_file = go/net/sock_bsd.go
+go_net_sockopt_file = go/net/sockopt_bsd.go
+go_net_sockoptip_file = go/net/sockoptip_bsd.go go/net/sockoptip_posix.go
 else
 go_net_cgo_file = go/net/cgo_bsd.go
 go_net_sock_file = go/net/sock_bsd.go
 go_net_sockopt_file = go/net/sockopt_bsd.go
-go_net_sockoptip_file = go/net/sockoptip_bsd.go go/net/sockoptip_netbsd.go
+go_net_sockoptip_file = go/net/sockoptip_bsd.go go/net/sockoptip_posix.go
+endif
 endif
 endif
 endif
@@ -706,8 +731,12 @@ 
 if LIBGO_IS_LINUX
 go_net_sendfile_file = go/net/sendfile_linux.go
 else
+if LIBGO_IS_FREEBSD
+go_net_sendfile_file = go/net/sendfile_freebsd.go
+else
 go_net_sendfile_file = go/net/sendfile_stub.go
 endif
+endif
 
 if LIBGO_IS_LINUX
 go_net_interface_file = go/net/interface_linux.go
@@ -725,13 +754,12 @@ 
 	go/net/dial.go \
 	go/net/dnsclient.go \
 	go/net/dnsclient_unix.go \
-	go/net/dnsconfig.go \
+	go/net/dnsconfig_unix.go \
 	go/net/dnsmsg.go \
-	go/net/doc.go \
 	$(go_net_newpollserver_file) \
-	go/net/fd.go \
+	go/net/fd_unix.go \
 	$(go_net_fd_os_file) \
-	go/net/file.go \
+	go/net/file_unix.go \
 	go/net/hosts.go \
 	go/net/interface.go \
 	$(go_net_interface_file) \
@@ -740,6 +768,7 @@ 
 	go/net/iprawsock_posix.go \
 	go/net/ipsock.go \
 	go/net/ipsock_posix.go \
+	go/net/lookup.go \
 	go/net/lookup_unix.go \
 	go/net/mac.go \
 	go/net/net.go \
@@ -747,12 +776,12 @@ 
 	go/net/parse.go \
 	go/net/pipe.go \
 	go/net/port.go \
+	go/net/port_unix.go \
 	$(go_net_sendfile_file) \
-	go/net/sock.go \
+	go/net/sock_posix.go \
 	$(go_net_sock_file) \
-	go/net/sockopt.go \
+	go/net/sockopt_posix.go \
 	$(go_net_sockopt_file) \
-	go/net/sockoptip.go \
 	$(go_net_sockoptip_file) \
 	go/net/tcpsock.go \
 	go/net/tcpsock_posix.go \
@@ -831,6 +860,7 @@ 
 
 go_reflect_files = \
 	go/reflect/deepequal.go \
+	go/reflect/makefunc.go \
 	go/reflect/type.go \
 	go/reflect/value.go
 
@@ -882,12 +912,14 @@ 
 go_strings_files = \
 	go/strings/reader.go \
 	go/strings/replace.go \
+	go/strings/search.go \
 	go/strings/strings.go
 
 go_sync_files = \
 	go/sync/cond.go \
 	go/sync/mutex.go \
 	go/sync/once.go \
+	go/sync/race0.go \
 	go/sync/runtime.go \
 	go/sync/rwmutex.go \
 	go/sync/waitgroup.go
@@ -930,11 +962,28 @@ 
 	go/unicode/letter.go \
 	go/unicode/tables.go
 
+if LIBGO_IS_LINUX
+archive_tar_atim_file = go/archive/tar/stat_atim.go
+endif
+if LIBGO_IS_OPENBSD
+archive_tar_atim_file = go/archive/tar/stat_atim.go
+endif
+if LIBGO_IS_DARWIN
+archive_tar_atim_file = go/archive/tar/stat_atimespec.go
+endif
+if LIBGO_IS_FREEBSD
+archive_tar_atim_file = go/archive/tar/stat_atimespec.go
+endif
+if LIBGO_IS_NETBSD
+archive_tar_atim_file = go/archive/tar/stat_atimespec.go
+endif
 
 go_archive_tar_files = \
 	go/archive/tar/common.go \
 	go/archive/tar/reader.go \
-	go/archive/tar/writer.go
+	go/archive/tar/stat_unix.go \
+	go/archive/tar/writer.go \
+	$(archive_tar_atim_file)
 
 go_archive_zip_files = \
 	go/archive/zip/reader.go \
@@ -948,6 +997,7 @@ 
 	go/compress/bzip2/move_to_front.go
 
 go_compress_flate_files = \
+	go/compress/flate/copy.go \
 	go/compress/flate/deflate.go \
 	go/compress/flate/huffman_bit_writer.go \
 	go/compress/flate/huffman_code.go \
@@ -979,6 +1029,7 @@ 
 go_crypto_aes_files = \
 	go/crypto/aes/block.go \
 	go/crypto/aes/cipher.go \
+	go/crypto/aes/cipher_generic.go \
 	go/crypto/aes/const.go
 go_crypto_cipher_files = \
 	go/crypto/cipher/cbc.go \
@@ -1033,9 +1084,11 @@ 
 	go/crypto/tls/handshake_server.go \
 	go/crypto/tls/key_agreement.go \
 	go/crypto/tls/prf.go \
+	go/crypto/tls/ticket.go \
 	go/crypto/tls/tls.go
 go_crypto_x509_files = \
 	go/crypto/x509/cert_pool.go \
+	go/crypto/x509/pem_decrypt.go \
 	go/crypto/x509/pkcs1.go \
 	go/crypto/x509/pkcs8.go \
 	go/crypto/x509/root.go \
@@ -1130,8 +1183,26 @@ 
 	go/exp/html/parse.go \
 	go/exp/html/render.go \
 	go/exp/html/token.go
+go_exp_html_atom_files = \
+	go/exp/html/atom/atom.go \
+	go/exp/html/atom/table.go
 go_exp_inotify_files = \
 	go/exp/inotify/inotify_linux.go
+go_exp_locale_collate_files = \
+	go/exp/locale/collate/colelem.go \
+	go/exp/locale/collate/collate.go \
+	go/exp/locale/collate/contract.go \
+	go/exp/locale/collate/export.go \
+	go/exp/locale/collate/table.go \
+	go/exp/locale/collate/tables.go \
+	go/exp/locale/collate/trie.go
+go_exp_locale_collate_build_files = \
+	go/exp/locale/collate/build/builder.go \
+	go/exp/locale/collate/build/colelem.go \
+	go/exp/locale/collate/build/contract.go \
+	go/exp/locale/collate/build/order.go \
+	go/exp/locale/collate/build/table.go \
+	go/exp/locale/collate/build/trie.go
 go_exp_norm_files = \
 	go/exp/norm/composition.go \
 	go/exp/norm/forminfo.go \
@@ -1161,6 +1232,7 @@ 
 
 go_go_ast_files = \
 	go/go/ast/ast.go \
+	go/go/ast/commentmap.go \
 	go/go/ast/filter.go \
 	go/go/ast/import.go \
 	go/go/ast/print.go \
@@ -1170,6 +1242,7 @@ 
 go_go_build_files = \
 	go/go/build/build.go \
 	go/go/build/doc.go \
+	go/go/build/read.go \
 	syslist.go
 go_go_doc_files = \
 	go/go/doc/comment.go \
@@ -1235,6 +1308,7 @@ 
 	go/image/jpeg/writer.go
 
 go_image_png_files = \
+	go/image/png/paeth.go \
 	go/image/png/reader.go \
 	go/image/png/writer.go
 
@@ -1243,6 +1317,7 @@ 
 	go/index/suffixarray/suffixarray.go
 
 go_io_ioutil_files = \
+	go/io/ioutil/blackhole.go \
 	go/io/ioutil/ioutil.go \
 	go/io/ioutil/tempfile.go
 
@@ -1358,6 +1433,7 @@ 
 
 go_regexp_syntax_files = \
 	go/regexp/syntax/compile.go \
+	go/regexp/syntax/doc.go \
 	go/regexp/syntax/parse.go \
 	go/regexp/syntax/perl_groups.go \
 	go/regexp/syntax/prog.go \
@@ -1544,6 +1620,7 @@ 
 	go/syscall/syscall_errno.go \
 	go/syscall/libcall_support.go \
 	go/syscall/libcall_posix.go \
+	go/syscall/race0.go \
 	go/syscall/socket.go \
 	go/syscall/sockcmsg_unix.go \
 	go/syscall/str.go \
@@ -1714,6 +1791,9 @@ 
 	encoding/xml.lo \
 	exp/ebnf.lo \
 	exp/html.lo \
+	exp/html/atom.lo \
+	exp/locale/collate.lo \
+	exp/locale/collate/build.lo \
 	exp/norm.lo \
 	exp/proxy.lo \
 	exp/terminal.lo \
@@ -2562,6 +2642,33 @@ 
 	@$(CHECK)
 .PHONY: exp/html/check
 
+@go_include@ exp/html/atom.lo.dep
+exp/html/atom.lo.dep: $(go_exp_html_atom_files)
+	$(BUILDDEPS)
+exp/html/atom.lo: $(go_exp_html_atom_files)
+	$(BUILDPACKAGE)
+exp/html/atom/check: $(CHECK_DEPS)
+	@$(CHECK)
+.PHONY: exp/html/atom/check
+
+@go_include@ exp/locale/collate.lo.dep
+exp/locale/collate.lo.dep: $(go_exp_locale_collate_files)
+	$(BUILDDEPS)
+exp/locale/collate.lo: $(go_exp_locale_collate_files)
+	$(BUILDPACKAGE)
+exp/locale/collate/check: $(CHECK_DEPS)
+	@$(CHECK)
+.PHONY: exp/locale/collate/check
+
+@go_include@ exp/locale/collate/build.lo.dep
+exp/locale/collate/build.lo.dep: $(go_exp_locale_collate_build_files)
+	$(BUILDDEPS)
+exp/locale/collate/build.lo: $(go_exp_locale_collate_build_files)
+	$(BUILDPACKAGE)
+exp/locale/collate/build/check: $(CHECK_DEPS)
+	@$(CHECK)
+.PHONY: exp/locale/collate/build/check
+
 @go_include@ exp/norm.lo.dep
 exp/norm.lo.dep: $(go_exp_norm_files)
 	$(BUILDDEPS)
@@ -3142,6 +3249,9 @@ 
 syscall/wait.lo: go/syscall/wait.c
 	@$(MKDIR_P) syscall
 	$(LTCOMPILE) -c -o $@ $<
+syscall/check: $(CHECK_DEPS)
+	@$(CHECK)
+.PHONY: syscall/check
 
 # How to build a .gox file from a .lo file.
 BUILDGOX = \
@@ -3310,8 +3420,14 @@ 
 	$(BUILDGOX)
 exp/html.gox: exp/html.lo
 	$(BUILDGOX)
+exp/html/atom.gox: exp/html/atom.lo
+	$(BUILDGOX)
 exp/inotify.gox: exp/inotify.lo
 	$(BUILDGOX)
+exp/locale/collate.gox: exp/locale/collate.lo
+	$(BUILDGOX)
+exp/locale/collate/build.gox: exp/locale/collate/build.lo
+	$(BUILDGOX)
 exp/norm.gox: exp/norm.lo
 	$(BUILDGOX)
 exp/proxy.gox: exp/proxy.lo
@@ -3484,6 +3600,7 @@ 
 	strconv/check \
 	strings/check \
 	sync/check \
+	syscall/check \
 	time/check \
 	unicode/check \
 	archive/tar/check \
@@ -3532,10 +3649,14 @@ 
 	encoding/xml/check \
 	exp/ebnf/check \
 	exp/html/check \
+	exp/html/atom/check \
 	$(exp_inotify_check) \
+	exp/locale/collate/check \
+	exp/locale/collate/build/check \
 	exp/norm/check \
 	exp/proxy/check \
 	exp/terminal/check \
+	exp/types/check \
 	exp/utf8string/check \
 	html/template/check \
 	go/ast/check \
diff -r bf12a7f41b67 libgo/configure.ac
--- a/libgo/configure.ac	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/configure.ac	Mon Oct 22 17:36:23 2012 -0700
@@ -129,6 +129,7 @@ 
 is_irix=no
 is_linux=no
 is_netbsd=no
+is_openbsd=no
 is_rtems=no
 is_solaris=no
 GOOS=unknown
@@ -138,6 +139,7 @@ 
   *-*-irix6*)    is_irix=yes;    GOOS=irix ;;
   *-*-linux*)    is_linux=yes;   GOOS=linux ;;
   *-*-netbsd*)	 is_netbsd=yes;  GOOS=netbsd ;;
+  *-*-openbsd*)  is_openbsd=yes; GOOS=openbsd ;;
   *-*-rtems*)    is_rtems=yes;   GOOS=rtems ;;
   *-*-solaris2*) is_solaris=yes; GOOS=solaris ;;
 esac
@@ -146,6 +148,7 @@ 
 AM_CONDITIONAL(LIBGO_IS_IRIX, test $is_irix = yes)
 AM_CONDITIONAL(LIBGO_IS_LINUX, test $is_linux = yes)
 AM_CONDITIONAL(LIBGO_IS_NETBSD, test $is_netbsd = yes)
+AM_CONDITIONAL(LIBGO_IS_OPENBSD, test $is_openbsd = yes)
 AM_CONDITIONAL(LIBGO_IS_RTEMS, test $is_rtems = yes)
 AM_CONDITIONAL(LIBGO_IS_SOLARIS, test $is_solaris = yes)
 AC_SUBST(GOOS)
diff -r bf12a7f41b67 libgo/runtime/chan.c
--- a/libgo/runtime/chan.c	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/runtime/chan.c	Mon Oct 22 17:36:23 2012 -0700
@@ -4,8 +4,9 @@ 
 
 #include "runtime.h"
 #include "arch.h"
+#include "go-type.h"
+#include "race.h"
 #include "malloc.h"
-#include "go-type.h"
 
 #define	NOSELGEN	1
 
@@ -24,6 +25,7 @@ 
 	G*	g;		// g and selgen constitute
 	uint32	selgen;		// a weak pointer to g
 	SudoG*	link;
+	int64	releasetime;
 	byte*	elem;		// data element
 };
 
@@ -35,13 +37,13 @@ 
 
 struct	Hchan
 {
-	uint32	qcount;			// total data in the q
-	uint32	dataqsiz;		// size of the circular q
+	uintgo	qcount;			// total data in the q
+	uintgo	dataqsiz;		// size of the circular q
 	uint16	elemsize;
 	bool	closed;
 	uint8	elemalign;
-	uint32	sendx;			// send index
-	uint32	recvx;			// receive index
+	uintgo	sendx;			// send index
+	uintgo	recvx;			// receive index
 	WaitQ	recvq;			// list of recv waiters
 	WaitQ	sendq;			// list of send waiters
 	Lock;
@@ -80,17 +82,22 @@ 
 static	void	dequeueg(WaitQ*);
 static	SudoG*	dequeue(WaitQ*);
 static	void	enqueue(WaitQ*, SudoG*);
+static	void	racesync(Hchan*, SudoG*);
 
 Hchan*
 runtime_makechan_c(ChanType *t, int64 hint)
 {
 	Hchan *c;
-	int32 n;
+	uintptr n;
 	const Type *elem;
 
 	elem = t->__element_type;
 
-	if(hint < 0 || (int32)hint != hint || (elem->__size > 0 && (uintptr)hint > MaxMem / elem->__size))
+	// compiler checks this but be safe.
+	if(elem->__size >= (1<<16))
+		runtime_throw("makechan: invalid channel element type");
+
+	if(hint < 0 || (intgo)hint != hint || (elem->__size > 0 && (uintptr)hint > MaxMem / elem->__size))
 		runtime_panicstring("makechan: size out of range");
 
 	n = sizeof(*c);
@@ -102,19 +109,19 @@ 
 	c->dataqsiz = hint;
 
 	if(debug)
-		runtime_printf("makechan: chan=%p; elemsize=%D; elemalign=%d; dataqsiz=%d\n",
-			c, (int64)elem->__size, elem->__align, c->dataqsiz);
+		runtime_printf("makechan: chan=%p; elemsize=%D; elemalign=%d; dataqsiz=%D\n",
+			c, (int64)elem->__size, elem->__align, (int64)c->dataqsiz);
 
 	return c;
 }
 
 // For reflect
-//	func makechan(typ *ChanType, size uint32) (chan)
-uintptr reflect_makechan(ChanType *, uint32)
+//	func makechan(typ *ChanType, size uint64) (chan)
+uintptr reflect_makechan(ChanType *, uint64)
   asm ("reflect.makechan");
 
 uintptr
-reflect_makechan(ChanType *t, uint32 size)
+reflect_makechan(ChanType *t, uint64 size)
 {
 	void *ret;
 	Hchan *c;
@@ -153,11 +160,12 @@ 
  * the operation; we'll see that it's now closed.
  */
 void
-runtime_chansend(ChanType *t, Hchan *c, byte *ep, bool *pres)
+runtime_chansend(ChanType *t, Hchan *c, byte *ep, bool *pres, void *pc)
 {
 	SudoG *sg;
 	SudoG mysg;
 	G* gp;
+	int64 t0;
 	G* g;
 
 	g = runtime_g();
@@ -168,9 +176,7 @@ 
 			*pres = false;
 			return;
 		}
-		g->status = Gwaiting;
-		g->waitreason = "chan send (nil chan)";
-		runtime_gosched();
+		runtime_park(nil, nil, "chan send (nil chan)");
 		return;  // not reached
 	}
 
@@ -181,7 +187,17 @@ 
 		runtime_printf("chansend: chan=%p\n", c);
 	}
 
+	t0 = 0;
+	mysg.releasetime = 0;
+	if(runtime_blockprofilerate > 0) {
+		t0 = runtime_cputicks();
+		mysg.releasetime = -1;
+	}
+
 	runtime_lock(c);
+	// TODO(dvyukov): add similar instrumentation to select.
+	if(raceenabled)
+		runtime_racereadpc(c, pc);
 	if(c->closed)
 		goto closed;
 
@@ -190,12 +206,16 @@ 
 
 	sg = dequeue(&c->recvq);
 	if(sg != nil) {
+		if(raceenabled)
+			racesync(c, sg);
 		runtime_unlock(c);
 
 		gp = sg->g;
 		gp->param = sg;
 		if(sg->elem != nil)
 			runtime_memmove(sg->elem, ep, c->elemsize);
+		if(sg->releasetime)
+			sg->releasetime = runtime_cputicks();
 		runtime_ready(gp);
 
 		if(pres != nil)
@@ -213,11 +233,8 @@ 
 	mysg.g = g;
 	mysg.selgen = NOSELGEN;
 	g->param = nil;
-	g->status = Gwaiting;
-	g->waitreason = "chan send";
 	enqueue(&c->sendq, &mysg);
-	runtime_unlock(c);
-	runtime_gosched();
+	runtime_park(runtime_unlock, c, "chan send");
 
 	if(g->param == nil) {
 		runtime_lock(c);
@@ -226,6 +243,9 @@ 
 		goto closed;
 	}
 
+	if(mysg.releasetime > 0)
+		runtime_blockevent(mysg.releasetime - t0, 2);
+
 	return;
 
 asynch:
@@ -241,15 +261,16 @@ 
 		mysg.g = g;
 		mysg.elem = nil;
 		mysg.selgen = NOSELGEN;
-		g->status = Gwaiting;
-		g->waitreason = "chan send";
 		enqueue(&c->sendq, &mysg);
-		runtime_unlock(c);
-		runtime_gosched();
+		runtime_park(runtime_unlock, c, "chan send");
 
 		runtime_lock(c);
 		goto asynch;
 	}
+
+	if(raceenabled)
+		runtime_racerelease(chanbuf(c, c->sendx));
+
 	runtime_memmove(chanbuf(c, c->sendx), ep, c->elemsize);
 	if(++c->sendx == c->dataqsiz)
 		c->sendx = 0;
@@ -259,11 +280,15 @@ 
 	if(sg != nil) {
 		gp = sg->g;
 		runtime_unlock(c);
+		if(sg->releasetime)
+			sg->releasetime = runtime_cputicks();
 		runtime_ready(gp);
 	} else
 		runtime_unlock(c);
 	if(pres != nil)
 		*pres = true;
+	if(mysg.releasetime > 0)
+		runtime_blockevent(mysg.releasetime - t0, 2);
 	return;
 
 closed:
@@ -278,6 +303,7 @@ 
 	SudoG *sg;
 	SudoG mysg;
 	G *gp;
+	int64 t0;
 	G *g;
 
 	if(runtime_gcwaiting)
@@ -294,12 +320,17 @@ 
 			*selected = false;
 			return;
 		}
-		g->status = Gwaiting;
-		g->waitreason = "chan receive (nil chan)";
-		runtime_gosched();
+		runtime_park(nil, nil, "chan receive (nil chan)");
 		return;  // not reached
 	}
 
+	t0 = 0;
+	mysg.releasetime = 0;
+	if(runtime_blockprofilerate > 0) {
+		t0 = runtime_cputicks();
+		mysg.releasetime = -1;
+	}
+
 	runtime_lock(c);
 	if(c->dataqsiz > 0)
 		goto asynch;
@@ -309,12 +340,16 @@ 
 
 	sg = dequeue(&c->sendq);
 	if(sg != nil) {
+		if(raceenabled)
+			racesync(c, sg);
 		runtime_unlock(c);
 
 		if(ep != nil)
 			runtime_memmove(ep, sg->elem, c->elemsize);
 		gp = sg->g;
 		gp->param = sg;
+		if(sg->releasetime)
+			sg->releasetime = runtime_cputicks();
 		runtime_ready(gp);
 
 		if(selected != nil)
@@ -334,11 +369,8 @@ 
 	mysg.g = g;
 	mysg.selgen = NOSELGEN;
 	g->param = nil;
-	g->status = Gwaiting;
-	g->waitreason = "chan receive";
 	enqueue(&c->recvq, &mysg);
-	runtime_unlock(c);
-	runtime_gosched();
+	runtime_park(runtime_unlock, c, "chan receive");
 
 	if(g->param == nil) {
 		runtime_lock(c);
@@ -349,6 +381,8 @@ 
 
 	if(received != nil)
 		*received = true;
+	if(mysg.releasetime > 0)
+		runtime_blockevent(mysg.releasetime - t0, 2);
 	return;
 
 asynch:
@@ -366,15 +400,16 @@ 
 		mysg.g = g;
 		mysg.elem = nil;
 		mysg.selgen = NOSELGEN;
-		g->status = Gwaiting;
-		g->waitreason = "chan receive";
 		enqueue(&c->recvq, &mysg);
-		runtime_unlock(c);
-		runtime_gosched();
+		runtime_park(runtime_unlock, c, "chan receive");
 
 		runtime_lock(c);
 		goto asynch;
 	}
+
+	if(raceenabled)
+		runtime_raceacquire(chanbuf(c, c->recvx));
+
 	if(ep != nil)
 		runtime_memmove(ep, chanbuf(c, c->recvx), c->elemsize);
 	runtime_memclr(chanbuf(c, c->recvx), c->elemsize);
@@ -386,6 +421,8 @@ 
 	if(sg != nil) {
 		gp = sg->g;
 		runtime_unlock(c);
+		if(sg->releasetime)
+			sg->releasetime = runtime_cputicks();
 		runtime_ready(gp);
 	} else
 		runtime_unlock(c);
@@ -394,6 +431,8 @@ 
 		*selected = true;
 	if(received != nil)
 		*received = true;
+	if(mysg.releasetime > 0)
+		runtime_blockevent(mysg.releasetime - t0, 2);
 	return;
 
 closed:
@@ -403,7 +442,11 @@ 
 		*selected = true;
 	if(received != nil)
 		*received = false;
+	if(raceenabled)
+		runtime_raceacquire(c);
 	runtime_unlock(c);
+	if(mysg.releasetime > 0)
+		runtime_blockevent(mysg.releasetime - t0, 2);
 }
 
 // The compiler generates a call to __go_send_small to send a value 8
@@ -424,7 +467,7 @@ 
 #else
 	p = u.b + sizeof(uint64) - t->__element_type->__size;
 #endif
-	runtime_chansend(t, c, p, nil);
+	runtime_chansend(t, c, p, nil, runtime_getcallerpc(&t));
 }
 
 // The compiler generates a call to __go_send_big to send a value
@@ -432,7 +475,7 @@ 
 void
 __go_send_big(ChanType *t, Hchan* c, byte* p)
 {
-	runtime_chansend(t, c, p, nil);
+	runtime_chansend(t, c, p, nil, runtime_getcallerpc(&t));
 }
 
 // The compiler generates a call to __go_receive_small to receive a
@@ -500,7 +543,7 @@ 
 {
 	bool res;
 
-	runtime_chansend(t, c, p, &res);
+	runtime_chansend(t, c, p, &res, runtime_getcallerpc(&t));
 	return res;
 }
 
@@ -590,7 +633,7 @@ 
 		vp = (byte*)&val;
 	else
 		vp = (byte*)val;
-	runtime_chansend(t, c, vp, sp);
+	runtime_chansend(t, c, vp, sp, runtime_getcallerpc(&t));
 	return selected;
 }
 
@@ -643,10 +686,10 @@ 
 
 // newselect(size uint32) (sel *byte);
 
-void* runtime_newselect(int) __asm__("runtime.newselect");
+void* runtime_newselect(int32) __asm__("runtime.newselect");
 
 void*
-runtime_newselect(int size)
+runtime_newselect(int32 size)
 {
 	Select *sel;
 
@@ -688,11 +731,11 @@ 
 
 // selectsend(sel *byte, hchan *chan any, elem *any) (selected bool);
 
-void runtime_selectsend(Select *, Hchan *, void *, int)
+void runtime_selectsend(Select *, Hchan *, void *, int32)
   __asm__("runtime.selectsend");
 
 void
-runtime_selectsend(Select *sel, Hchan *c, void *elem, int index)
+runtime_selectsend(Select *sel, Hchan *c, void *elem, int32 index)
 {
 	// nil cases do not compete
 	if(c == nil)
@@ -728,11 +771,11 @@ 
 
 // selectrecv(sel *byte, hchan *chan any, elem *any) (selected bool);
 
-void runtime_selectrecv(Select *, Hchan *, void *, int)
+void runtime_selectrecv(Select *, Hchan *, void *, int32)
   __asm__("runtime.selectrecv");
 
 void
-runtime_selectrecv(Select *sel, Hchan *c, void *elem, int index)
+runtime_selectrecv(Select *sel, Hchan *c, void *elem, int32 index)
 {
 	// nil cases do not compete
 	if(c == nil)
@@ -743,11 +786,11 @@ 
 
 // selectrecv2(sel *byte, hchan *chan any, elem *any, received *bool) (selected bool);
 
-void runtime_selectrecv2(Select *, Hchan *, void *, bool *, int)
+void runtime_selectrecv2(Select *, Hchan *, void *, bool *, int32)
   __asm__("runtime.selectrecv2");
 
 void
-runtime_selectrecv2(Select *sel, Hchan *c, void *elem, bool *received, int index)
+runtime_selectrecv2(Select *sel, Hchan *c, void *elem, bool *received, int32 index)
 {
 	// nil cases do not compete
 	if(c == nil)
@@ -784,16 +827,16 @@ 
 
 // selectdefault(sel *byte) (selected bool);
 
-void runtime_selectdefault(Select *, int) __asm__("runtime.selectdefault");
+void runtime_selectdefault(Select *, int32) __asm__("runtime.selectdefault");
 
 void
-runtime_selectdefault(Select *sel, int index)
+runtime_selectdefault(Select *sel, int32 index)
 {
 	selectdefault(sel, index);
 }
 
 static void
-selectdefault(Select *sel, int index)
+selectdefault(Select *sel, int32 index)
 {
 	int32 i;
 	Scase *cas;
@@ -848,12 +891,7 @@ 
 void
 runtime_block(void)
 {
-	G *g;
-
-	g = runtime_g();
-	g->status = Gwaiting;	// forever
-	g->waitreason = "select (no cases)";
-	runtime_gosched();
+	runtime_park(nil, nil, "select (no cases)");	// forever
 }
 
 static int selectgo(Select**);
@@ -985,10 +1023,7 @@ 
 	}
 
 	g->param = nil;
-	g->status = Gwaiting;
-	g->waitreason = "select";
-	selunlock(sel);
-	runtime_gosched();
+	runtime_park((void(*)(Lock*))selunlock, (Lock*)sel, "select");
 
 	sellock(sel);
 	sg = g->param;
@@ -1029,6 +1064,8 @@ 
 
 asyncrecv:
 	// can receive from buffer
+	if(raceenabled)
+		runtime_raceacquire(chanbuf(c, c->recvx));
 	if(cas->receivedp != nil)
 		*cas->receivedp = true;
 	if(cas->sg.elem != nil)
@@ -1049,6 +1086,8 @@ 
 
 asyncsend:
 	// can send to buffer
+	if(raceenabled)
+		runtime_racerelease(chanbuf(c, c->sendx));
 	runtime_memmove(chanbuf(c, c->sendx), cas->sg.elem, c->elemsize);
 	if(++c->sendx == c->dataqsiz)
 		c->sendx = 0;
@@ -1065,6 +1104,8 @@ 
 
 syncrecv:
 	// can receive from sleeping sender (sg)
+	if(raceenabled)
+		racesync(c, sg);
 	selunlock(sel);
 	if(debug)
 		runtime_printf("syncrecv: sel=%p c=%p o=%d\n", sel, c, o);
@@ -1084,10 +1125,14 @@ 
 		*cas->receivedp = false;
 	if(cas->sg.elem != nil)
 		runtime_memclr(cas->sg.elem, c->elemsize);
+	if(raceenabled)
+		runtime_raceacquire(c);
 	goto retc;
 
 syncsend:
 	// can send to sleeping receiver (sg)
+	if(raceenabled)
+		racesync(c, sg);
 	selunlock(sel);
 	if(debug)
 		runtime_printf("syncsend: sel=%p c=%p o=%d\n", sel, c, o);
@@ -1110,6 +1155,102 @@ 
 	return 0;  // not reached
 }
 
+// This struct must match ../reflect/value.go:/runtimeSelect.
+typedef struct runtimeSelect runtimeSelect;
+struct runtimeSelect
+{
+	uintptr dir;
+	ChanType *typ;
+	Hchan *ch;
+	uintptr val;
+};
+
+// This enum must match ../reflect/value.go:/SelectDir.
+enum SelectDir {
+	SelectSend = 1,
+	SelectRecv,
+	SelectDefault,
+};
+
+struct rselect_ret {
+	intgo chosen;
+	uintptr word;
+	bool recvOK;
+};
+
+// func rselect(cases []runtimeSelect) (chosen int, word uintptr, recvOK bool)
+
+struct rselect_ret reflect_rselect(Slice)
+     asm("reflect.rselect");
+
+struct rselect_ret
+reflect_rselect(Slice cases)
+{
+	struct rselect_ret ret;
+	int32 i;
+	Select *sel;
+	runtimeSelect* rcase, *rc;
+	void *elem;
+	void *recvptr;
+	uintptr maxsize;
+	bool onlyptr;
+
+	ret.chosen = -1;
+	ret.word = 0;
+	ret.recvOK = false;
+
+	maxsize = 0;
+	onlyptr = true;
+	rcase = (runtimeSelect*)cases.__values;
+	for(i=0; i<cases.__count; i++) {
+		rc = &rcase[i];
+		if(rc->dir == SelectRecv && rc->ch != nil) {
+			if(maxsize < rc->typ->__element_type->__size)
+				maxsize = rc->typ->__element_type->__size;
+			if(!__go_is_pointer_type(rc->typ->__element_type))
+				onlyptr = false;
+		}
+	}
+
+	recvptr = nil;
+	if(!onlyptr)
+		recvptr = runtime_mal(maxsize);
+
+	newselect(cases.__count, &sel);
+	for(i=0; i<cases.__count; i++) {
+		rc = &rcase[i];
+		switch(rc->dir) {
+		case SelectDefault:
+			selectdefault(sel, i);
+			break;
+		case SelectSend:
+			if(rc->ch == nil)
+				break;
+			if(!__go_is_pointer_type(rc->typ->__element_type))
+				elem = (void*)rc->val;
+			else
+				elem = (void*)&rc->val;
+			selectsend(sel, rc->ch, i, elem);
+			break;
+		case SelectRecv:
+			if(rc->ch == nil)
+				break;
+			if(!__go_is_pointer_type(rc->typ->__element_type))
+				elem = recvptr;
+			else
+				elem = &ret.word;
+			selectrecv(sel, rc->ch, i, elem, &ret.recvOK);
+			break;
+		}
+	}
+
+	ret.chosen = (intgo)(uintptr)selectgo(&sel);
+	if(rcase[ret.chosen].dir == SelectRecv && !__go_is_pointer_type(rcase[ret.chosen].typ->__element_type))
+		ret.word = (uintptr)recvptr;
+
+	return ret;
+}
+
 // closechan(sel *byte);
 void
 runtime_closechan(Hchan *c)
@@ -1129,6 +1270,11 @@ 
 		runtime_panicstring("close of closed channel");
 	}
 
+	if(raceenabled) {
+		runtime_racewritepc(c, runtime_getcallerpc(&c));
+		runtime_racerelease(c);
+	}
+
 	c->closed = true;
 
 	// release all readers
@@ -1172,15 +1318,15 @@ 
 }
 
 // For reflect
-//	func chanlen(c chan) (len int32)
+//	func chanlen(c chan) (len int)
 
-int32 reflect_chanlen(uintptr) __asm__("reflect.chanlen");
+intgo reflect_chanlen(uintptr) __asm__("reflect.chanlen");
 
-int32
+intgo
 reflect_chanlen(uintptr ca)
 {
 	Hchan *c;
-	int32 len;
+	intgo len;
 
 	c = (Hchan*)ca;
 	if(c == nil)
@@ -1190,22 +1336,22 @@ 
 	return len;
 }
 
-int
+intgo
 __go_chan_len(Hchan *c)
 {
 	return reflect_chanlen((uintptr)c);
 }
 
 // For reflect
-//	func chancap(c chan) (cap int32)
+//	func chancap(c chan) (cap intgo)
 
-int32 reflect_chancap(uintptr) __asm__("reflect.chancap");
+intgo reflect_chancap(uintptr) __asm__("reflect.chancap");
 
-int32
+intgo
 reflect_chancap(uintptr ca)
 {
 	Hchan *c;
-	int32 cap;
+	intgo cap;
 
 	c = (Hchan*)ca;
 	if(c == nil)
@@ -1215,7 +1361,7 @@ 
 	return cap;
 }
 
-int
+intgo
 __go_chan_cap(Hchan *c)
 {
 	return reflect_chancap((uintptr)c);
@@ -1273,3 +1419,12 @@ 
 	q->last->link = sgp;
 	q->last = sgp;
 }
+
+static void
+racesync(Hchan *c, SudoG *sg)
+{
+	runtime_racerelease(chanbuf(c, 0));
+	runtime_raceacquireg(sg->g, chanbuf(c, 0));
+	runtime_racereleaseg(sg->g, chanbuf(c, 0));
+	runtime_raceacquire(chanbuf(c, 0));
+}
diff -r bf12a7f41b67 libgo/runtime/cpuprof.c
--- a/libgo/runtime/cpuprof.c	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/runtime/cpuprof.c	Mon Oct 22 17:36:23 2012 -0700
@@ -130,7 +130,7 @@ 
 // SetCPUProfileRate sets the CPU profiling rate.
 // The user documentation is in debug.go.
 void
-runtime_SetCPUProfileRate(int32 hz)
+runtime_SetCPUProfileRate(intgo hz)
 {
 	uintptr *p;
 	uintptr n;
diff -r bf12a7f41b67 libgo/runtime/go-rune.c
--- a/libgo/runtime/go-rune.c	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/runtime/go-rune.c	Mon Oct 22 17:36:23 2012 -0700
@@ -15,7 +15,7 @@ 
 int
 __go_get_rune (const unsigned char *str, size_t len, int *rune)
 {
-  int c, c1, c2, c3;
+  int c, c1, c2, c3, l;
 
   /* Default to the "replacement character".  */
   *rune = 0xfffd;
@@ -37,8 +37,10 @@ 
   if ((c & 0xe0) == 0xc0
       && (c1 & 0xc0) == 0x80)
     {
-      *rune = (((c & 0x1f) << 6)
-	       + (c1 & 0x3f));
+      l = (((c & 0x1f) << 6) + (c1 & 0x3f));
+      if (l <= 0x7f)
+	return 1;
+      *rune = l;
       return 2;
     }
 
@@ -50,17 +52,21 @@ 
       && (c1 & 0xc0) == 0x80
       && (c2 & 0xc0) == 0x80)
     {
-      *rune = (((c & 0xf) << 12)
-	       + ((c1 & 0x3f) << 6)
-	       + (c2 & 0x3f));
+      l = (((c & 0xf) << 12)
+	   + ((c1 & 0x3f) << 6)
+	   + (c2 & 0x3f));
 
-      if (*rune >= 0xd800 && *rune < 0xe000)
+      if (l <= 0x7ff)
+	return 1;
+
+      if (l >= 0xd800 && l < 0xe000)
 	{
 	  /* Invalid surrogate half; return replace character.  */
-	  *rune = 0xfffd;
 	  return 1;
 	}
 
+      *rune = l;
+
       return 3;
     }
 
@@ -73,10 +79,15 @@ 
       && (c2 & 0xc0) == 0x80
       && (c3 & 0xc0) == 0x80)
     {
-      *rune = (((c & 0x7) << 18)
-	       + ((c1 & 0x3f) << 12)
-	       + ((c2 & 0x3f) << 6)
-	       + (c3 & 0x3f));
+      l = (((c & 0x7) << 18)
+	   + ((c1 & 0x3f) << 12)
+	   + ((c2 & 0x3f) << 6)
+	   + (c3 & 0x3f));
+
+      if (l <= 0xffff || l > 0x10ffff)
+	return 1;
+
+      *rune = l;
       return 4;
     }
 
diff -r bf12a7f41b67 libgo/runtime/go-signal.c
--- a/libgo/runtime/go-signal.c	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/runtime/go-signal.c	Mon Oct 22 17:36:23 2012 -0700
@@ -138,6 +138,19 @@ 
 #undef P
 #undef D
 
+
+static int8 badsignal[] = "runtime: signal received on thread not created by Go.\n";
+
+static void
+runtime_badsignal(int32 sig)
+{
+	if (sig == SIGPROF) {
+		return;  // Ignore SIGPROFs intended for a non-Go thread.
+	}
+	runtime_write(2, badsignal, sizeof badsignal - 1);
+	runtime_exit(1);
+}
+
 /* Handle a signal, for cases where we don't panic.  We can split the
    stack here.  */
 
@@ -146,6 +159,12 @@ 
 {
   int i;
 
+  if (runtime_m () == NULL)
+    {
+      runtime_badsignal (sig);
+      return;
+    }
+
 #ifdef SIGPROF
   if (sig == SIGPROF)
     {
diff -r bf12a7f41b67 libgo/runtime/go-trampoline.c
--- a/libgo/runtime/go-trampoline.c	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/runtime/go-trampoline.c	Mon Oct 22 17:36:23 2012 -0700
@@ -106,8 +106,8 @@ 
    no other references to it.  */
 
 void
-runtime_trampoline_scan (void (*scan) (byte *, int64))
+runtime_trampoline_scan (void (*addroot) (byte *, uintptr))
 {
   if (trampoline_page != NULL)
-    scan ((byte *) &trampoline_page, sizeof trampoline_page);
+    addroot ((byte *) &trampoline_page, sizeof trampoline_page);
 }
diff -r bf12a7f41b67 libgo/runtime/lfstack.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libgo/runtime/lfstack.c	Mon Oct 22 17:36:23 2012 -0700
@@ -0,0 +1,66 @@ 
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Lock-free stack.
+
+#include "runtime.h"
+#include "arch.h"
+
+#if __SIZEOF_POINTER__ == 8
+// Amd64 uses 48-bit virtual addresses, 47-th bit is used as kernel/user flag.
+// So we use 17msb of pointers as ABA counter.
+# define PTR_BITS 47
+#else
+# define PTR_BITS 32
+#endif
+#define PTR_MASK ((1ull<<PTR_BITS)-1)
+
+void
+runtime_lfstackpush(uint64 *head, LFNode *node)
+{
+	uint64 old, new;
+
+	if((uintptr)node != ((uintptr)node&PTR_MASK)) {
+		runtime_printf("p=%p\n", node);
+		runtime_throw("runtime_lfstackpush: invalid pointer");
+	}
+
+	node->pushcnt++;
+	new = (uint64)(uintptr)node|(((uint64)node->pushcnt)<<PTR_BITS);
+	old = runtime_atomicload64(head);
+	for(;;) {
+		node->next = (LFNode*)(uintptr)(old&PTR_MASK);
+		if(runtime_cas64(head, &old, new))
+			break;
+	}
+}
+
+LFNode*
+runtime_lfstackpop(uint64 *head)
+{
+	LFNode *node, *node2;
+	uint64 old, new;
+
+	old = runtime_atomicload64(head);
+	for(;;) {
+		if(old == 0)
+			return nil;
+		node = (LFNode*)(uintptr)(old&PTR_MASK);
+		node2 = runtime_atomicloadp(&node->next);
+		new = 0;
+		if(node2 != nil)
+			new = (uint64)(uintptr)node2|(((uint64)node2->pushcnt)<<PTR_BITS);
+		if(runtime_cas64(head, &old, new))
+			return node;
+	}
+}
+
+LFNode* runtime_lfstackpop2(uint64*)
+  asm("runtime.lfstackpop2");
+
+LFNode*
+runtime_lfstackpop2(uint64 *head)
+{
+	return runtime_lfstackpop(head);
+}
diff -r bf12a7f41b67 libgo/runtime/malloc.goc
--- a/libgo/runtime/malloc.goc	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/runtime/malloc.goc	Mon Oct 22 17:36:23 2012 -0700
@@ -17,12 +17,13 @@ 
 #include "go-string.h"
 #include "interface.h"
 #include "go-type.h"
+#include "race.h"
 
 MHeap runtime_mheap;
 
 extern MStats mstats;	// defined in extern.go
 
-extern volatile int32 runtime_MemProfileRate
+extern volatile intgo runtime_MemProfileRate
   __asm__ ("runtime.MemProfileRate");
 
 // Allocate an object of at least size bytes.
@@ -33,7 +34,8 @@ 
 {
 	M *m;
 	G *g;
-	int32 sizeclass, rate;
+	int32 sizeclass;
+	intgo rate;
 	MCache *c;
 	uintptr npages;
 	MSpan *s;
@@ -53,6 +55,9 @@ 
 	if(size == 0)
 		size = 1;
 
+	if(DebugTypeAtBlockEnd)
+		size += sizeof(uintptr);
+
 	c = m->mcache;
 	c->local_nmalloc++;
 	if(size <= MaxSmallSize) {
@@ -72,7 +77,7 @@ 
 		npages = size >> PageShift;
 		if((size & PageMask) != 0)
 			npages++;
-		s = runtime_MHeap_Alloc(&runtime_mheap, npages, 0, 1);
+		s = runtime_MHeap_Alloc(&runtime_mheap, npages, 0, 1, zeroed);
 		if(s == nil)
 			runtime_throw("out of memory");
 		size = npages<<PageShift;
@@ -83,9 +88,20 @@ 
 		// setup for mark sweep
 		runtime_markspan(v, 0, 0, true);
 	}
+
+	if (sizeof(void*) == 4 && c->local_total_alloc >= (1<<30)) {
+		// purge cache stats to prevent overflow
+		runtime_lock(&runtime_mheap);
+		runtime_purgecachedstats(c);
+		runtime_unlock(&runtime_mheap);
+	}
+
 	if(!(flag & FlagNoGC))
 		runtime_markallocated(v, size, (flag&FlagNoPointers) != 0);
 
+	if(DebugTypeAtBlockEnd)
+		*(uintptr*)((uintptr)v+size-sizeof(uintptr)) = 0;
+
 	m->mallocing = 0;
 
 	if(!(flag & FlagNoProfiling) && (rate = runtime_MemProfileRate) > 0) {
@@ -107,6 +123,11 @@ 
 
 	if(dogc && mstats.heap_alloc >= mstats.next_gc)
 		runtime_gc(0);
+
+	if(raceenabled) {
+		runtime_racemalloc(v, size, m->racepc);
+		m->racepc = nil;
+	}
 	return v;
 }
 
@@ -144,6 +165,9 @@ 
 	}
 	prof = runtime_blockspecial(v);
 
+	if(raceenabled)
+		runtime_racefree(v);
+
 	// Find size class for v.
 	sizeclass = s->sizeclass;
 	c = m->mcache;
@@ -178,11 +202,21 @@ 
 int32
 runtime_mlookup(void *v, byte **base, uintptr *size, MSpan **sp)
 {
+	M *m;
 	uintptr n, i;
 	byte *p;
 	MSpan *s;
 
-	runtime_m()->mcache->local_nlookup++;
+	m = runtime_m();
+
+	m->mcache->local_nlookup++;
+	if (sizeof(void*) == 4 && m->mcache->local_nlookup >= (1<<30)) {
+		// purge cache stats to prevent overflow
+		runtime_lock(&runtime_mheap);
+		runtime_purgecachedstats(m->mcache);
+		runtime_unlock(&runtime_mheap);
+	}
+
 	s = runtime_MHeap_LookupMaybe(&runtime_mheap, v);
 	if(sp)
 		*sp = s;
@@ -210,7 +244,7 @@ 
 		return 0;
 	}
 
-	n = runtime_class_to_size[s->sizeclass];
+	n = s->elemsize;
 	if(base) {
 		i = ((byte*)v - p)/n;
 		*base = p + i*n;
@@ -224,7 +258,7 @@ 
 MCache*
 runtime_allocmcache(void)
 {
-	int32 rate;
+	intgo rate;
 	MCache *c;
 
 	runtime_lock(&runtime_mheap);
@@ -232,6 +266,7 @@ 
 	mstats.mcache_inuse = runtime_mheap.cachealloc.inuse;
 	mstats.mcache_sys = runtime_mheap.cachealloc.sys;
 	runtime_unlock(&runtime_mheap);
+	runtime_memclr((byte*)c, sizeof(*c));
 
 	// Set first allocation sample size.
 	rate = runtime_MemProfileRate;
@@ -244,12 +279,19 @@ 
 }
 
 void
-runtime_purgecachedstats(M* m)
+runtime_freemcache(MCache *c)
 {
-	MCache *c;
+	runtime_MCache_ReleaseAll(c);
+	runtime_lock(&runtime_mheap);
+	runtime_purgecachedstats(c);
+	runtime_FixAlloc_Free(&runtime_mheap.cachealloc, c);
+	runtime_unlock(&runtime_mheap);
+}
 
+void
+runtime_purgecachedstats(MCache *c)
+{
 	// Protected by either heap or GC lock.
-	c = m->mcache;
 	mstats.heap_alloc += c->local_cachealloc;
 	c->local_cachealloc = 0;
 	mstats.heap_objects += c->local_objects;
@@ -445,6 +487,220 @@ 
 	return p;
 }
 
+static Lock settype_lock;
+
+void
+runtime_settype_flush(M *m, bool sysalloc)
+{
+	uintptr *buf, *endbuf;
+	uintptr size, ofs, j, t;
+	uintptr ntypes, nbytes2, nbytes3;
+	uintptr *data2;
+	byte *data3;
+	bool sysalloc3;
+	void *v;
+	uintptr typ, p;
+	MSpan *s;
+
+	buf = m->settype_buf;
+	endbuf = buf + m->settype_bufsize;
+
+	runtime_lock(&settype_lock);
+	while(buf < endbuf) {
+		v = (void*)*buf;
+		*buf = 0;
+		buf++;
+		typ = *buf;
+		buf++;
+
+		// (Manually inlined copy of runtime_MHeap_Lookup)
+		p = (uintptr)v>>PageShift;
+		if(sizeof(void*) == 8)
+			p -= (uintptr)runtime_mheap.arena_start >> PageShift;
+		s = runtime_mheap.map[p];
+
+		if(s->sizeclass == 0) {
+			s->types.compression = MTypes_Single;
+			s->types.data = typ;
+			continue;
+		}
+
+		size = s->elemsize;
+		ofs = ((uintptr)v - (s->start<<PageShift)) / size;
+
+		switch(s->types.compression) {
+		case MTypes_Empty:
+			ntypes = (s->npages << PageShift) / size;
+			nbytes3 = 8*sizeof(uintptr) + 1*ntypes;
+
+			if(!sysalloc) {
+				data3 = runtime_mallocgc(nbytes3, FlagNoPointers, 0, 1);
+			} else {
+				data3 = runtime_SysAlloc(nbytes3);
+				if(0) runtime_printf("settype(0->3): SysAlloc(%x) --> %p\n", (uint32)nbytes3, data3);
+			}
+
+			s->types.compression = MTypes_Bytes;
+			s->types.sysalloc = sysalloc;
+			s->types.data = (uintptr)data3;
+
+			((uintptr*)data3)[1] = typ;
+			data3[8*sizeof(uintptr) + ofs] = 1;
+			break;
+
+		case MTypes_Words:
+			((uintptr*)s->types.data)[ofs] = typ;
+			break;
+
+		case MTypes_Bytes:
+			data3 = (byte*)s->types.data;
+			for(j=1; j<8; j++) {
+				if(((uintptr*)data3)[j] == typ) {
+					break;
+				}
+				if(((uintptr*)data3)[j] == 0) {
+					((uintptr*)data3)[j] = typ;
+					break;
+				}
+			}
+			if(j < 8) {
+				data3[8*sizeof(uintptr) + ofs] = j;
+			} else {
+				ntypes = (s->npages << PageShift) / size;
+				nbytes2 = ntypes * sizeof(uintptr);
+
+				if(!sysalloc) {
+					data2 = runtime_mallocgc(nbytes2, FlagNoPointers, 0, 1);
+				} else {
+					data2 = runtime_SysAlloc(nbytes2);
+					if(0) runtime_printf("settype.(3->2): SysAlloc(%x) --> %p\n", (uint32)nbytes2, data2);
+				}
+
+				sysalloc3 = s->types.sysalloc;
+
+				s->types.compression = MTypes_Words;
+				s->types.sysalloc = sysalloc;
+				s->types.data = (uintptr)data2;
+
+				// Move the contents of data3 to data2. Then deallocate data3.
+				for(j=0; j<ntypes; j++) {
+					t = data3[8*sizeof(uintptr) + j];
+					t = ((uintptr*)data3)[t];
+					data2[j] = t;
+				}
+				if(sysalloc3) {
+					nbytes3 = 8*sizeof(uintptr) + 1*ntypes;
+					if(0) runtime_printf("settype.(3->2): SysFree(%p,%x)\n", data3, (uint32)nbytes3);
+					runtime_SysFree(data3, nbytes3);
+				}
+
+				data2[ofs] = typ;
+			}
+			break;
+		}
+	}
+	runtime_unlock(&settype_lock);
+
+	m->settype_bufsize = 0;
+}
+
+// It is forbidden to use this function if it is possible that
+// explicit deallocation via calling runtime_free(v) may happen.
+void
+runtime_settype(void *v, uintptr t)
+{
+	M *m1;
+	uintptr *buf;
+	uintptr i;
+	MSpan *s;
+
+	if(t == 0)
+		runtime_throw("settype: zero type");
+
+	m1 = runtime_m();
+	buf = m1->settype_buf;
+	i = m1->settype_bufsize;
+	buf[i+0] = (uintptr)v;
+	buf[i+1] = t;
+	i += 2;
+	m1->settype_bufsize = i;
+
+	if(i == nelem(m1->settype_buf)) {
+		runtime_settype_flush(m1, false);
+	}
+
+	if(DebugTypeAtBlockEnd) {
+		s = runtime_MHeap_Lookup(&runtime_mheap, v);
+		*(uintptr*)((uintptr)v+s->elemsize-sizeof(uintptr)) = t;
+	}
+}
+
+void
+runtime_settype_sysfree(MSpan *s)
+{
+	uintptr ntypes, nbytes;
+
+	if(!s->types.sysalloc)
+		return;
+
+	nbytes = (uintptr)-1;
+
+	switch (s->types.compression) {
+	case MTypes_Words:
+		ntypes = (s->npages << PageShift) / s->elemsize;
+		nbytes = ntypes * sizeof(uintptr);
+		break;
+	case MTypes_Bytes:
+		ntypes = (s->npages << PageShift) / s->elemsize;
+		nbytes = 8*sizeof(uintptr) + 1*ntypes;
+		break;
+	}
+
+	if(nbytes != (uintptr)-1) {
+		if(0) runtime_printf("settype: SysFree(%p,%x)\n", (void*)s->types.data, (uint32)nbytes);
+		runtime_SysFree((void*)s->types.data, nbytes);
+	}
+}
+
+uintptr
+runtime_gettype(void *v)
+{
+	MSpan *s;
+	uintptr t, ofs;
+	byte *data;
+
+	s = runtime_MHeap_LookupMaybe(&runtime_mheap, v);
+	if(s != nil) {
+		t = 0;
+		switch(s->types.compression) {
+		case MTypes_Empty:
+			break;
+		case MTypes_Single:
+			t = s->types.data;
+			break;
+		case MTypes_Words:
+			ofs = (uintptr)v - (s->start<<PageShift);
+			t = ((uintptr*)s->types.data)[ofs/s->elemsize];
+			break;
+		case MTypes_Bytes:
+			ofs = (uintptr)v - (s->start<<PageShift);
+			data = (byte*)s->types.data;
+			t = data[8*sizeof(uintptr) + ofs/s->elemsize];
+			t = ((uintptr*)data)[t];
+			break;
+		default:
+			runtime_throw("runtime_gettype: invalid compression kind");
+		}
+		if(0) {
+			runtime_lock(&settype_lock);
+			runtime_printf("%p -> %d,%X\n", v, (int32)s->types.compression, (int64)t);
+			runtime_unlock(&settype_lock);
+		}
+		return t;
+	}
+	return 0;
+}
+
 // Runtime stubs.
 
 void*
@@ -453,9 +709,24 @@ 
 	return runtime_mallocgc(n, 0, 1, 1);
 }
 
-func new(typ *Type) (ret *uint8) {
-	uint32 flag = typ->__code&GO_NO_POINTERS ? FlagNoPointers : 0;
+void *
+runtime_new(Type *typ)
+{
+	void *ret;
+	uint32 flag;
+
+	runtime_m()->racepc = runtime_getcallerpc(&typ);
+	flag = typ->__code&GO_NO_POINTERS ? FlagNoPointers : 0;
 	ret = runtime_mallocgc(typ->__size, flag, 1, 1);
+
+	if(UseSpanType && !flag) {
+		if(false) {
+			runtime_printf("new %S: %p\n", *typ->__reflection, ret);
+		}
+		runtime_settype(ret, (uintptr)typ | TypeInfo_SingleObject);
+	}
+
+	return ret;
 }
 
 func GC() {
diff -r bf12a7f41b67 libgo/runtime/malloc.h
--- a/libgo/runtime/malloc.h	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/runtime/malloc.h	Mon Oct 22 17:36:23 2012 -0700
@@ -85,6 +85,7 @@ 
 typedef struct MSpan	MSpan;
 typedef struct MStats	MStats;
 typedef struct MLink	MLink;
+typedef struct MTypes	MTypes;
 
 enum
 {
@@ -124,8 +125,8 @@ 
 	// Max number of threads to run garbage collection.
 	// 2, 3, and 4 are all plausible maximums depending
 	// on the hardware details of the machine.  The garbage
-	// collector scales well to 4 cpus.
-	MaxGcproc = 4,
+	// collector scales well to 8 cpus.
+	MaxGcproc = 8,
 };
 
 // Maximum memory allocation size, a hint for callers.
@@ -282,19 +283,19 @@ 
 struct MCache
 {
 	MCacheList list[NumSizeClasses];
-	uint64 size;
-	int64 local_cachealloc;	// bytes allocated (or freed) from cache since last lock of heap
-	int64 local_objects;	// objects allocated (or freed) from cache since last lock of heap
-	int64 local_alloc;	// bytes allocated (or freed) since last lock of heap
-	int64 local_total_alloc;	// bytes allocated (even if freed) since last lock of heap
-	int64 local_nmalloc;	// number of mallocs since last lock of heap
-	int64 local_nfree;	// number of frees since last lock of heap
-	int64 local_nlookup;	// number of pointer lookups since last lock of heap
+	uintptr size;
+	intptr local_cachealloc;	// bytes allocated (or freed) from cache since last lock of heap
+	intptr local_objects;	// objects allocated (or freed) from cache since last lock of heap
+	intptr local_alloc;	// bytes allocated (or freed) since last lock of heap
+	uintptr local_total_alloc;	// bytes allocated (even if freed) since last lock of heap
+	uintptr local_nmalloc;	// number of mallocs since last lock of heap
+	uintptr local_nfree;	// number of frees since last lock of heap
+	uintptr local_nlookup;	// number of pointer lookups since last lock of heap
 	int32 next_sample;	// trigger heap sample after allocating this many bytes
 	// Statistics about allocation size classes since last lock of heap
 	struct {
-		int64 nmalloc;
-		int64 nfree;
+		uintptr nmalloc;
+		uintptr nfree;
 	} local_by_size[NumSizeClasses];
 
 };
@@ -303,6 +304,44 @@ 
 void	runtime_MCache_Free(MCache *c, void *p, int32 sizeclass, uintptr size);
 void	runtime_MCache_ReleaseAll(MCache *c);
 
+// MTypes describes the types of blocks allocated within a span.
+// The compression field describes the layout of the data.
+//
+// MTypes_Empty:
+//     All blocks are free, or no type information is available for
+//     allocated blocks.
+//     The data field has no meaning.
+// MTypes_Single:
+//     The span contains just one block.
+//     The data field holds the type information.
+//     The sysalloc field has no meaning.
+// MTypes_Words:
+//     The span contains multiple blocks.
+//     The data field points to an array of type [NumBlocks]uintptr,
+//     and each element of the array holds the type of the corresponding
+//     block.
+// MTypes_Bytes:
+//     The span contains at most seven different types of blocks.
+//     The data field points to the following structure:
+//         struct {
+//             type  [8]uintptr       // type[0] is always 0
+//             index [NumBlocks]byte
+//         }
+//     The type of the i-th block is: data.type[data.index[i]]
+enum
+{
+	MTypes_Empty = 0,
+	MTypes_Single = 1,
+	MTypes_Words = 2,
+	MTypes_Bytes = 3,
+};
+struct MTypes
+{
+	byte	compression;	// one of MTypes_*
+	bool	sysalloc;	// whether (void*)data is from runtime_SysAlloc
+	uintptr	data;
+};
+
 // An MSpan is a run of pages.
 enum
 {
@@ -315,16 +354,17 @@ 
 {
 	MSpan	*next;		// in a span linked list
 	MSpan	*prev;		// in a span linked list
-	MSpan	*allnext;	// in the list of all spans
 	PageID	start;		// starting page number
 	uintptr	npages;		// number of pages in span
 	MLink	*freelist;	// list of free objects
 	uint32	ref;		// number of allocated objects in this span
 	uint32	sizeclass;	// size class
+	uintptr	elemsize;	// computed from sizeclass or from npages
 	uint32	state;		// MSpanInUse etc
 	int64   unusedsince;	// First time spotted by GC in MSpanFree state
 	uintptr npreleased;	// number of pages released to the OS
 	byte	*limit;		// end of data in span
+	MTypes	types;		// types of allocated objects in this span
 };
 
 void	runtime_MSpan_Init(MSpan *span, PageID start, uintptr npages);
@@ -351,6 +391,7 @@ 
 void	runtime_MCentral_Init(MCentral *c, int32 sizeclass);
 int32	runtime_MCentral_AllocList(MCentral *c, int32 n, MLink **first);
 void	runtime_MCentral_FreeList(MCentral *c, int32 n, MLink *first);
+void	runtime_MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *end);
 
 // Main malloc heap.
 // The heap itself is the "free[]" and "large" arrays,
@@ -360,7 +401,9 @@ 
 	Lock;
 	MSpan free[MaxMHeapList];	// free lists of given length
 	MSpan large;			// free lists length >= MaxMHeapList
-	MSpan *allspans;
+	MSpan **allspans;
+	uint32	nspan;
+	uint32	nspancap;
 
 	// span lookup
 	MSpan *map[1<<MHeapMap_Bits];
@@ -387,7 +430,7 @@ 
 extern MHeap runtime_mheap;
 
 void	runtime_MHeap_Init(MHeap *h, void *(*allocator)(uintptr));
-MSpan*	runtime_MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, int32 acct);
+MSpan*	runtime_MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, int32 acct, int32 zeroed);
 void	runtime_MHeap_Free(MHeap *h, MSpan *s, int32 acct);
 MSpan*	runtime_MHeap_Lookup(MHeap *h, void *v);
 MSpan*	runtime_MHeap_LookupMaybe(MHeap *h, void *v);
@@ -408,7 +451,12 @@ 
 void	runtime_unmarkspan(void *v, uintptr size);
 bool	runtime_blockspecial(void*);
 void	runtime_setblockspecial(void*, bool);
-void	runtime_purgecachedstats(M*);
+void	runtime_purgecachedstats(MCache*);
+
+void	runtime_settype(void*, uintptr);
+void	runtime_settype_flush(M*, bool);
+void	runtime_settype_sysfree(MSpan*);
+uintptr	runtime_gettype(void*);
 
 enum
 {
@@ -421,10 +469,21 @@ 
 void	runtime_MProf_Malloc(void*, uintptr);
 void	runtime_MProf_Free(void*, uintptr);
 void	runtime_MProf_GC(void);
-void	runtime_MProf_Mark(void (*scan)(byte *, int64));
-int32	runtime_helpgc(bool*);
+void	runtime_MProf_Mark(void (*addroot)(byte *, uintptr));
+int32	runtime_gcprocs(void);
+void	runtime_helpgc(int32 nproc);
 void	runtime_gchelper(void);
 
 struct __go_func_type;
 bool	runtime_getfinalizer(void *p, bool del, void (**fn)(void*), const struct __go_func_type **ft);
-void	runtime_walkfintab(void (*fn)(void*), void (*scan)(byte *, int64));
+void	runtime_walkfintab(void (*fn)(void*), void (*scan)(byte *, uintptr));
+
+enum
+{
+	TypeInfo_SingleObject = 0,
+	TypeInfo_Array = 1,
+	TypeInfo_Map = 2,
+
+	// Enables type information at the end of blocks allocated from heap	
+	DebugTypeAtBlockEnd = 0,
+};
diff -r bf12a7f41b67 libgo/runtime/mcache.c
--- a/libgo/runtime/mcache.c	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/runtime/mcache.c	Mon Oct 22 17:36:23 2012 -0700
@@ -43,11 +43,6 @@ 
 		// block is zeroed iff second word is zero ...
 		if(size > sizeof(uintptr) && ((uintptr*)v)[1] != 0)
 			runtime_memclr((byte*)v, size);
-		else {
-			// ... except for the link pointer
-			// that we used above; zero that.
-			v->next = nil;
-		}
 	}
 	c->local_cachealloc += size;
 	c->local_objects++;
diff -r bf12a7f41b67 libgo/runtime/mcentral.c
--- a/libgo/runtime/mcentral.c	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/runtime/mcentral.c	Mon Oct 22 17:36:23 2012 -0700
@@ -88,9 +88,6 @@ 
 }
 
 // Free n objects back into the central free list.
-// Return the number of objects allocated.
-// The objects are linked together by their first words.
-// On return, *pstart points at the first object and *pend at the last.
 void
 runtime_MCentral_FreeList(MCentral *c, int32 n, MLink *start)
 {
@@ -148,6 +145,42 @@ 
 	}
 }
 
+// Free n objects from a span s back into the central free list c.
+// Called from GC.
+void
+runtime_MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *end)
+{
+	int32 size;
+
+	runtime_lock(c);
+
+	// Move to nonempty if necessary.
+	if(s->freelist == nil) {
+		runtime_MSpanList_Remove(s);
+		runtime_MSpanList_Insert(&c->nonempty, s);
+	}
+
+	// Add the objects back to s's free list.
+	end->next = s->freelist;
+	s->freelist = start;
+	s->ref -= n;
+	c->nfree += n;
+
+	// If s is completely freed, return it to the heap.
+	if(s->ref == 0) {
+		size = runtime_class_to_size[c->sizeclass];
+		runtime_MSpanList_Remove(s);
+		*(uintptr*)(s->start<<PageShift) = 1;  // needs zeroing
+		s->freelist = nil;
+		c->nfree -= (s->npages << PageShift) / size;
+		runtime_unlock(c);
+		runtime_unmarkspan((byte*)(s->start<<PageShift), s->npages<<PageShift);
+		runtime_MHeap_Free(&runtime_mheap, s, 0);
+	} else {
+		runtime_unlock(c);
+	}
+}
+
 void
 runtime_MGetSizeClassInfo(int32 sizeclass, uintptr *sizep, int32 *npagesp, int32 *nobj)
 {
@@ -174,7 +207,7 @@ 
 
 	runtime_unlock(c);
 	runtime_MGetSizeClassInfo(c->sizeclass, &size, &npages, &n);
-	s = runtime_MHeap_Alloc(&runtime_mheap, npages, c->sizeclass, 0);
+	s = runtime_MHeap_Alloc(&runtime_mheap, npages, c->sizeclass, 0, 1);
 	if(s == nil) {
 		// TODO(rsc): Log out of memory
 		runtime_lock(c);
diff -r bf12a7f41b67 libgo/runtime/mfinal.c
--- a/libgo/runtime/mfinal.c	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/runtime/mfinal.c	Mon Oct 22 17:36:23 2012 -0700
@@ -193,7 +193,7 @@ 
 }
 
 void
-runtime_walkfintab(void (*fn)(void*), void (*scan)(byte *, int64))
+runtime_walkfintab(void (*fn)(void*), void (*addroot)(byte *, uintptr))
 {
 	void **key;
 	void **ekey;
@@ -206,8 +206,8 @@ 
 		for(; key < ekey; key++)
 			if(*key != nil && *key != ((void*)-1))
 				fn(*key);
-		scan((byte*)&fintab[i].fkey, sizeof(void*));
-		scan((byte*)&fintab[i].val, sizeof(void*));
+		addroot((byte*)&fintab[i].fkey, sizeof(void*));
+		addroot((byte*)&fintab[i].val, sizeof(void*));
 		runtime_unlock(&fintab[i]);
 	}
 }
diff -r bf12a7f41b67 libgo/runtime/mgc0.c
--- a/libgo/runtime/mgc0.c	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/runtime/mgc0.c	Mon Oct 22 17:36:23 2012 -0700
@@ -9,6 +9,7 @@ 
 #include "runtime.h"
 #include "arch.h"
 #include "malloc.h"
+#include "race.h"
 
 #ifdef USING_SPLIT_STACK
 
@@ -22,8 +23,8 @@ 
 
 enum {
 	Debug = 0,
-	PtrSize = sizeof(void*),
 	DebugMark = 0,  // run second pass to check mark
+	DataBlock = 8*1024,
 
 	// Four bits per word (see #defines below).
 	wordsPerBitmapWord = sizeof(void*)*8/4,
@@ -78,17 +79,14 @@ 
 //
 uint32 runtime_worldsema = 1;
 
-// TODO: Make these per-M.
-static uint64 nhandoff;
-
 static int32 gctrace;
 
 typedef struct Workbuf Workbuf;
 struct Workbuf
 {
-	Workbuf *next;
+	LFNode node; // must be first
 	uintptr nobj;
-	byte *obj[512-2];
+	byte *obj[512-(sizeof(LFNode)+sizeof(uintptr))/sizeof(byte*)];
 };
 
 typedef struct Finalizer Finalizer;
@@ -122,22 +120,32 @@ 
 static void	putempty(Workbuf*);
 static Workbuf* handoff(Workbuf*);
 
+typedef struct GcRoot GcRoot;
+struct GcRoot
+{
+	byte *p;
+	uintptr n;
+};
+
 static struct {
-	Lock fmu;
-	Workbuf	*full;
-	Lock emu;
-	Workbuf	*empty;
+	uint64	full;  // lock-free list of full blocks
+	uint64	empty; // lock-free list of empty blocks
+	byte	pad0[CacheLineSize]; // prevents false-sharing between full/empty and nproc/nwait
 	uint32	nproc;
 	volatile uint32	nwait;
 	volatile uint32	ndone;
+	volatile uint32 debugmarkdone;
 	Note	alldone;
-	Lock	markgate;
-	Lock	sweepgate;
-	MSpan	*spans;
+	ParFor	*markfor;
+	ParFor	*sweepfor;
 
 	Lock;
 	byte	*chunk;
 	uintptr	nchunk;
+
+	GcRoot	*roots;
+	uint32	nroot;
+	uint32	rootcap;
 } work;
 
 // scanblock scans a block of n bytes starting at pointer b for references
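
The full/empty buffer lists above are now lock-free stacks manipulated with
runtime_lfstackpush/runtime_lfstackpop (the lfstack implementation comes in
with the upstream import and is not reproduced in this excerpt).  That is
why LFNode must be the first field of Workbuf: a Workbuf* and the LFNode*
pushed onto the list are the same address.  A minimal usage sketch, not
part of the patch:

	// Sketch: hand back a drained buffer and try to grab a full one,
	// as getfull() below does.
	static Workbuf*
	recycle(Workbuf *done)
	{
		if(done != nil)
			runtime_lfstackpush(&work.empty, &done->node);	// &done->node == (LFNode*)done
		return (Workbuf*)runtime_lfstackpop(&work.full);	// nil if nothing is queued
	}
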
@@ -147,7 +155,7 @@ 
 // body.  Keeping an explicit work list is easier on the stack allocator and
 // more efficient.
 static void
-scanblock(byte *b, int64 n)
+scanblock(byte *b, uintptr n)
 {
 	byte *obj, *arena_start, *arena_used, *p;
 	void **vp;
@@ -158,8 +166,8 @@ 
 	Workbuf *wbuf;
 	bool keepworking;
 
-	if((int64)(uintptr)n != n || n < 0) {
-		runtime_printf("scanblock %p %D\n", b, n);
+	if((intptr)n < 0) {
+		runtime_printf("scanblock %p %D\n", b, (int64)n);
 		runtime_throw("scanblock");
 	}
 
@@ -173,7 +181,7 @@ 
 	nobj = 0;  // number of queued objects
 
 	// Scanblock helpers pass b==nil.
-	// The main proc needs to return to make more
+	// Procs need to return to make more
 	// calls to scanblock.  But if work.nproc==1 then
 	// might as well process blocks as soon as we
 	// have them.
@@ -190,7 +198,7 @@ 
 		// Each iteration scans the block b of length n, queueing pointers in
 		// the work buffer.
 		if(Debug > 1)
-			runtime_printf("scanblock %p %D\n", b, n);
+			runtime_printf("scanblock %p %D\n", b, (int64)n);
 
 		vp = (void**)b;
 		n >>= (2+PtrSize/8);  /* n /= PtrSize (4 or 8) */
@@ -257,6 +265,14 @@ 
 			bits = xbits >> shift;
 
 		found:
+			// If another proc wants a pointer, give it some.
+			if(work.nwait > 0 && nobj > 4 && work.full == 0) {
+				wbuf->nobj = nobj;
+				wbuf = handoff(wbuf);
+				nobj = wbuf->nobj;
+				wp = (void**)(wbuf->obj + nobj);
+			}
+
 			// Now we have bits, bitp, and shift correct for
 			// obj pointing at the base of the object.
 			// Only care about allocated and not marked.
@@ -278,13 +294,7 @@ 
 			if((bits & bitNoPointers) != 0)
 				continue;
 
-			// If another proc wants a pointer, give it some.
-			if(nobj > 4 && work.nwait > 0 && work.full == nil) {
-				wbuf->nobj = nobj;
-				wbuf = handoff(wbuf);
-				nobj = wbuf->nobj;
-				wp = (void**)(wbuf->obj + nobj);
-			}
+			PREFETCH(obj);
 
 			// If buffer is full, get a new one.
 			if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
@@ -305,7 +315,8 @@ 
 		// Fetch b from the work buffer.
 		if(nobj == 0) {
 			if(!keepworking) {
-				putempty(wbuf);
+				if(wbuf)
+					putempty(wbuf);
 				return;
 			}
 			// Emptied our buffer: refill.
@@ -335,7 +346,7 @@ 
 // it is simpler, slower, single-threaded, recursive,
 // and uses bitSpecial as the mark bit.
 static void
-debug_scanblock(byte *b, int64 n)
+debug_scanblock(byte *b, uintptr n)
 {
 	byte *obj, *p;
 	void **vp;
@@ -345,8 +356,8 @@ 
 	if(!DebugMark)
 		runtime_throw("debug_scanblock without DebugMark");
 
-	if((int64)(uintptr)n != n || n < 0) {
-		runtime_printf("debug_scanblock %p %D\n", b, n);
+	if((intptr)n < 0) {
+		runtime_printf("debug_scanblock %p %D\n", b, (int64)n);
 		runtime_throw("debug_scanblock");
 	}
 
@@ -374,7 +385,6 @@ 
 		if(s == nil)
 			continue;
 
-
 		p =  (byte*)((uintptr)s->start<<PageShift);
 		if(s->sizeclass == 0) {
 			obj = p;
@@ -411,53 +421,33 @@ 
 	}
 }
 
+static void
+markroot(ParFor *desc, uint32 i)
+{
+	USED(&desc);
+	scanblock(work.roots[i].p, work.roots[i].n);
+}
+
 // Get an empty work buffer off the work.empty list,
 // allocating new buffers as needed.
 static Workbuf*
 getempty(Workbuf *b)
 {
-	if(work.nproc == 1) {
-		// Put b on full list.
-		if(b != nil) {
-			b->next = work.full;
-			work.full = b;
+	if(b != nil)
+		runtime_lfstackpush(&work.full, &b->node);
+	b = (Workbuf*)runtime_lfstackpop(&work.empty);
+	if(b == nil) {
+		// Need to allocate.
+		runtime_lock(&work);
+		if(work.nchunk < sizeof *b) {
+			work.nchunk = 1<<20;
+			work.chunk = runtime_SysAlloc(work.nchunk);
 		}
-		// Grab from empty list if possible.
-		b = work.empty;
-		if(b != nil) {
-			work.empty = b->next;
-			goto haveb;
-		}
-	} else {
-		// Put b on full list.
-		if(b != nil) {
-			runtime_lock(&work.fmu);
-			b->next = work.full;
-			work.full = b;
-			runtime_unlock(&work.fmu);
-		}
-		// Grab from empty list if possible.
-		runtime_lock(&work.emu);
-		b = work.empty;
-		if(b != nil)
-			work.empty = b->next;
-		runtime_unlock(&work.emu);
-		if(b != nil)
-			goto haveb;
+		b = (Workbuf*)work.chunk;
+		work.chunk += sizeof *b;
+		work.nchunk -= sizeof *b;
+		runtime_unlock(&work);
 	}
-
-	// Need to allocate.
-	runtime_lock(&work);
-	if(work.nchunk < sizeof *b) {
-		work.nchunk = 1<<20;
-		work.chunk = runtime_SysAlloc(work.nchunk);
-	}
-	b = (Workbuf*)work.chunk;
-	work.chunk += sizeof *b;
-	work.nchunk -= sizeof *b;
-	runtime_unlock(&work);
-
-haveb:
 	b->nobj = 0;
 	return b;
 }
@@ -465,112 +455,95 @@ 
 static void
 putempty(Workbuf *b)
 {
-	if(b == nil)
-		return;
-
-	if(work.nproc == 1) {
-		b->next = work.empty;
-		work.empty = b;
-		return;
-	}
-
-	runtime_lock(&work.emu);
-	b->next = work.empty;
-	work.empty = b;
-	runtime_unlock(&work.emu);
+	runtime_lfstackpush(&work.empty, &b->node);
 }
 
 // Get a full work buffer off the work.full list, or return nil.
 static Workbuf*
 getfull(Workbuf *b)
 {
+	M *m;
 	int32 i;
-	Workbuf *b1;
 
-	if(work.nproc == 1) {
-		// Put b on empty list.
-		if(b != nil) {
-			b->next = work.empty;
-			work.empty = b;
-		}
-		// Grab from full list if possible.
-		// Since work.nproc==1, no one else is
-		// going to give us work.
-		b = work.full;
-		if(b != nil)
-			work.full = b->next;
+	if(b != nil)
+		runtime_lfstackpush(&work.empty, &b->node);
+	b = (Workbuf*)runtime_lfstackpop(&work.full);
+	if(b != nil || work.nproc == 1)
 		return b;
-	}
 
-	putempty(b);
-
-	// Grab buffer from full list if possible.
-	for(;;) {
-		b1 = work.full;
-		if(b1 == nil)
-			break;
-		runtime_lock(&work.fmu);
-		if(work.full != nil) {
-			b1 = work.full;
-			work.full = b1->next;
-			runtime_unlock(&work.fmu);
-			return b1;
-		}
-		runtime_unlock(&work.fmu);
-	}
-
+	m = runtime_m();
 	runtime_xadd(&work.nwait, +1);
 	for(i=0;; i++) {
-		b1 = work.full;
-		if(b1 != nil) {
-			runtime_lock(&work.fmu);
-			if(work.full != nil) {
-				runtime_xadd(&work.nwait, -1);
-				b1 = work.full;
-				work.full = b1->next;
-				runtime_unlock(&work.fmu);
-				return b1;
-			}
-			runtime_unlock(&work.fmu);
-			continue;
+		if(work.full != 0) {
+			runtime_xadd(&work.nwait, -1);
+			b = (Workbuf*)runtime_lfstackpop(&work.full);
+			if(b != nil)
+				return b;
+			runtime_xadd(&work.nwait, +1);
 		}
 		if(work.nwait == work.nproc)
 			return nil;
-		if(i < 10)
+		if(i < 10) {
+			m->gcstats.nprocyield++;
 			runtime_procyield(20);
-		else if(i < 20)
+		} else if(i < 20) {
+			m->gcstats.nosyield++;
 			runtime_osyield();
-		else
+		} else {
+			m->gcstats.nsleep++;
 			runtime_usleep(100);
+		}
 	}
 }
 
 static Workbuf*
 handoff(Workbuf *b)
 {
+	M *m;
 	int32 n;
 	Workbuf *b1;
 
+	m = runtime_m();
+
 	// Make new buffer with half of b's pointers.
 	b1 = getempty(nil);
 	n = b->nobj/2;
 	b->nobj -= n;
 	b1->nobj = n;
 	runtime_memmove(b1->obj, b->obj+b->nobj, n*sizeof b1->obj[0]);
-	nhandoff += n;
+	m->gcstats.nhandoff++;
+	m->gcstats.nhandoffcnt += n;
 
 	// Put b on full list - let first half of b get stolen.
-	runtime_lock(&work.fmu);
-	b->next = work.full;
-	work.full = b;
-	runtime_unlock(&work.fmu);
-
+	runtime_lfstackpush(&work.full, &b->node);
 	return b1;
 }
 
-// Scanstack calls scanblock on each of gp's stack segments.
 static void
-scanstack(void (*scanblock)(byte*, int64), G *gp)
+addroot(byte *p, uintptr n)
+{
+	uint32 cap;
+	GcRoot *new;
+
+	if(work.nroot >= work.rootcap) {
+		cap = PageSize/sizeof(GcRoot);
+		if(cap < 2*work.rootcap)
+			cap = 2*work.rootcap;
+		new = (GcRoot*)runtime_SysAlloc(cap*sizeof(GcRoot));
+		if(work.roots != nil) {
+			runtime_memmove(new, work.roots, work.rootcap*sizeof(GcRoot));
+			runtime_SysFree(work.roots, work.rootcap*sizeof(GcRoot));
+		}
+		work.roots = new;
+		work.rootcap = cap;
+	}
+	work.roots[work.nroot].p = p;
+	work.roots[work.nroot].n = n;
+	work.nroot++;
+}
+
+static void
+addstackroots(G *gp)
 {
 #ifdef USING_SPLIT_STACK
 	M *mp;
@@ -609,11 +582,11 @@ 
 		}
 	}
 	if(sp != nil) {
-		scanblock(sp, spsize);
+		addroot(sp, spsize);
 		while((sp = __splitstack_find(next_segment, next_sp,
 					      &spsize, &next_segment,
 					      &next_sp, &initial_sp)) != nil)
-			scanblock(sp, spsize);
+			addroot(sp, spsize);
 	}
 #else
 	M *mp;
@@ -635,16 +608,14 @@ 
 	}
 	top = (byte*)gp->gcinitial_sp + gp->gcstack_size;
 	if(top > bottom)
-		scanblock(bottom, top - bottom);
+		addroot(bottom, top - bottom);
 	else
-		scanblock(top, bottom - top);
+		addroot(top, bottom - top);
 #endif
 }
 
-// Markfin calls scanblock on the blocks that have finalizers:
-// the things pointed at cannot be freed until the finalizers have run.
 static void
-markfin(void *v)
+addfinroots(void *v)
 {
 	uintptr size;
 
@@ -653,7 +624,7 @@ 
 		runtime_throw("mark - finalizer inconsistency");
 
 	// do not mark the finalizer block itself.  just mark the things it points at.
-	scanblock(v, size);
+	addroot(v, size);
 }
 
 static struct root_list* roots;
@@ -668,22 +639,15 @@ 
 }
 
 static void
-debug_markfin(void *v)
-{
-	uintptr size;
-
-	if(!runtime_mlookup(v, (byte**)&v, &size, nil))
-		runtime_throw("debug_mark - finalizer inconsistency");
-	debug_scanblock(v, size);
-}
-
-// Mark
-static void
-mark(void (*scan)(byte*, int64))
+addroots(void)
 {
 	struct root_list *pl;
 	G *gp;
 	FinBlock *fb;
+	MSpan *s, **allspans;
+	uint32 spanidx;
+
+	work.nroot = 0;
 
 	// mark data+bss.
 	for(pl = roots; pl != nil; pl = pl->next) {
@@ -692,20 +656,36 @@ 
 			void *decl = pr->decl;
 			if(decl == nil)
 				break;
-			scanblock(decl, pr->size);
+			addroot(decl, pr->size);
 			pr++;
 		}
 	}
 
-	scan((byte*)&runtime_m0, sizeof runtime_m0);
-	scan((byte*)&runtime_g0, sizeof runtime_g0);
-	scan((byte*)&runtime_allg, sizeof runtime_allg);
-	scan((byte*)&runtime_allm, sizeof runtime_allm);
-	runtime_MProf_Mark(scan);
-	runtime_time_scan(scan);
-	runtime_trampoline_scan(scan);
+	addroot((byte*)&runtime_m0, sizeof runtime_m0);
+	addroot((byte*)&runtime_g0, sizeof runtime_g0);
+	addroot((byte*)&runtime_allg, sizeof runtime_allg);
+	addroot((byte*)&runtime_allm, sizeof runtime_allm);
+	runtime_MProf_Mark(addroot);
+	runtime_time_scan(addroot);
+	runtime_trampoline_scan(addroot);
 
-	// mark stacks
+	// MSpan.types
+	allspans = runtime_mheap.allspans;
+	for(spanidx=0; spanidx<runtime_mheap.nspan; spanidx++) {
+		s = allspans[spanidx];
+		if(s->state == MSpanInUse) {
+			switch(s->types.compression) {
+			case MTypes_Empty:
+			case MTypes_Single:
+				break;
+			case MTypes_Words:
+			case MTypes_Bytes:
+				addroot((byte*)&s->types.data, sizeof(void*));
+				break;
+			}
+		}
+	}
+
 	for(gp=runtime_allg; gp!=nil; gp=gp->alllink) {
 		switch(gp->status){
 		default:
@@ -716,27 +696,22 @@ 
 		case Grunning:
 			if(gp != runtime_g())
 				runtime_throw("mark - world not stopped");
-			scanstack(scan, gp);
+			addstackroots(gp);
 			break;
 		case Grunnable:
 		case Gsyscall:
 		case Gwaiting:
-			scanstack(scan, gp);
+			addstackroots(gp);
 			break;
 		}
 	}
 
-	// mark things pointed at by objects with finalizers
-	if(scan == debug_scanblock)
-		runtime_walkfintab(debug_markfin, scan);
-	else
-		runtime_walkfintab(markfin, scan);
+	runtime_walkfintab(addfinroots, addroot);
 
 	for(fb=allfin; fb; fb=fb->alllink)
-		scanblock((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]));
+		addroot((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]));
 
-	// in multiproc mode, join in the queued work.
-	scan(nil, 0);
+	addroot((byte*)&work, sizeof work);
 }
 
 static bool
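
Root scanning is now split into two phases: addroots() records every root
range in work.roots, and gc() hands that array to a ParFor so markroot()
can scan the roots in parallel before the threads drain the shared work
buffers.  Condensed from the gc() changes further down (illustrative only,
not a separate code path):

	// Sketch of the two-phase mark setup performed in gc().
	addroots();					// fill work.roots / work.nroot
	if(work.markfor == nil)
		work.markfor = runtime_parforalloc(MaxGcproc);
	runtime_parforsetup(work.markfor, work.nproc, work.nroot, nil, false, markroot);
	runtime_parfordo(work.markfor);			// each thread scans a slice of the roots
	scanblock(nil, 0);				// then helps drain the work buffers
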
@@ -771,122 +746,149 @@ 
 	f->fn = fn;
 	f->ft = ft;
 	f->arg = p;
-	runtime_unlock(&finlock); 
+	runtime_unlock(&finlock);
 	return true;
 }
 
 // Sweep frees or collects finalizers for blocks not marked in the mark phase.
 // It clears the mark bits in preparation for the next GC round.
 static void
-sweep(void)
+sweepspan(ParFor *desc, uint32 idx)
 {
 	M *m;
-	MSpan *s;
 	int32 cl, n, npages;
 	uintptr size;
 	byte *p;
 	MCache *c;
 	byte *arena_start;
-	int64 now;
+	MLink head, *end;
+	int32 nfree;
+	byte *type_data;
+	byte compression;
+	uintptr type_data_inc;
+	MSpan *s;
 
 	m = runtime_m();
+
+	USED(&desc);
+	s = runtime_mheap.allspans[idx];
+	// Stamp newly unused spans. The scavenger will use that
+	// info to potentially give back some pages to the OS.
+	if(s->state == MSpanFree && s->unusedsince == 0)
+		s->unusedsince = runtime_nanotime();
+	if(s->state != MSpanInUse)
+		return;
 	arena_start = runtime_mheap.arena_start;
-	now = runtime_nanotime();
+	p = (byte*)(s->start << PageShift);
+	cl = s->sizeclass;
+	size = s->elemsize;
+	if(cl == 0) {
+		n = 1;
+	} else {
+		// Chunk full of small blocks.
+		npages = runtime_class_to_allocnpages[cl];
+		n = (npages << PageShift) / size;
+	}
+	nfree = 0;
+	end = &head;
+	c = m->mcache;
+	
+	type_data = (byte*)s->types.data;
+	type_data_inc = sizeof(uintptr);
+	compression = s->types.compression;
+	switch(compression) {
+	case MTypes_Bytes:
+		type_data += 8*sizeof(uintptr);
+		type_data_inc = 1;
+		break;
+	}
 
-	for(;;) {
-		s = work.spans;
-		if(s == nil)
-			break;
-		if(!runtime_casp(&work.spans, s, s->allnext))
+	// Sweep through n objects of given size starting at p.
+	// This thread owns the span now, so it can manipulate
+	// the block bitmap without atomic operations.
+	for(; n > 0; n--, p += size, type_data+=type_data_inc) {
+		uintptr off, *bitp, shift, bits;
+
+		off = (uintptr*)p - (uintptr*)arena_start;
+		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
+		shift = off % wordsPerBitmapWord;
+		bits = *bitp>>shift;
+
+		if((bits & bitAllocated) == 0)
 			continue;
 
-		// Stamp newly unused spans. The scavenger will use that
-		// info to potentially give back some pages to the OS.
-		if(s->state == MSpanFree && s->unusedsince == 0)
-			s->unusedsince = now;
-
-		if(s->state != MSpanInUse)
+		if((bits & bitMarked) != 0) {
+			if(DebugMark) {
+				if(!(bits & bitSpecial))
+					runtime_printf("found spurious mark on %p\n", p);
+				*bitp &= ~(bitSpecial<<shift);
+			}
+			*bitp &= ~(bitMarked<<shift);
 			continue;
-
-		p = (byte*)(s->start << PageShift);
-		cl = s->sizeclass;
-		if(cl == 0) {
-			size = s->npages<<PageShift;
-			n = 1;
-		} else {
-			// Chunk full of small blocks.
-			size = runtime_class_to_size[cl];
-			npages = runtime_class_to_allocnpages[cl];
-			n = (npages << PageShift) / size;
 		}
 
-		// Sweep through n objects of given size starting at p.
-		// This thread owns the span now, so it can manipulate
-		// the block bitmap without atomic operations.
-		for(; n > 0; n--, p += size) {
-			uintptr off, *bitp, shift, bits;
+		// Special means it has a finalizer or is being profiled.
+		// In DebugMark mode, the bit has been coopted so
+		// we have to assume all blocks are special.
+		if(DebugMark || (bits & bitSpecial) != 0) {
+			if(handlespecial(p, size))
+				continue;
+		}
 
-			off = (uintptr*)p - (uintptr*)arena_start;
-			bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-			shift = off % wordsPerBitmapWord;
-			bits = *bitp>>shift;
+		// Mark freed; restore block boundary bit.
+		*bitp = (*bitp & ~(bitMask<<shift)) | (bitBlockBoundary<<shift);
 
-			if((bits & bitAllocated) == 0)
-				continue;
-
-			if((bits & bitMarked) != 0) {
-				if(DebugMark) {
-					if(!(bits & bitSpecial))
-						runtime_printf("found spurious mark on %p\n", p);
-					*bitp &= ~(bitSpecial<<shift);
-				}
-				*bitp &= ~(bitMarked<<shift);
-				continue;
-			}
-
-			// Special means it has a finalizer or is being profiled.
-			// In DebugMark mode, the bit has been coopted so
-			// we have to assume all blocks are special.
-			if(DebugMark || (bits & bitSpecial) != 0) {
-				if(handlespecial(p, size))
-					continue;
-			}
-
-			// Mark freed; restore block boundary bit.
-			*bitp = (*bitp & ~(bitMask<<shift)) | (bitBlockBoundary<<shift);
-
-			c = m->mcache;
-			if(s->sizeclass == 0) {
-				// Free large span.
-				runtime_unmarkspan(p, 1<<PageShift);
-				*(uintptr*)p = 1;	// needs zeroing
-				runtime_MHeap_Free(&runtime_mheap, s, 1);
-			} else {
-				// Free small object.
-				if(size > sizeof(uintptr))
-					((uintptr*)p)[1] = 1;	// mark as "needs to be zeroed"
-				c->local_by_size[s->sizeclass].nfree++;
-				runtime_MCache_Free(c, p, s->sizeclass, size);
-			}
+		if(cl == 0) {
+			// Free large span.
+			runtime_unmarkspan(p, 1<<PageShift);
+			*(uintptr*)p = 1;	// needs zeroing
+			runtime_MHeap_Free(&runtime_mheap, s, 1);
 			c->local_alloc -= size;
 			c->local_nfree++;
+		} else {
+			// Free small object.
+			switch(compression) {
+			case MTypes_Words:
+				*(uintptr*)type_data = 0;
+				break;
+			case MTypes_Bytes:
+				*(byte*)type_data = 0;
+				break;
+			}
+			if(size > sizeof(uintptr))
+				((uintptr*)p)[1] = 1;	// mark as "needs to be zeroed"
+			
+			end->next = (MLink*)p;
+			end = (MLink*)p;
+			nfree++;
 		}
 	}
+
+	if(nfree) {
+		c->local_by_size[cl].nfree += nfree;
+		c->local_alloc -= size * nfree;
+		c->local_nfree += nfree;
+		c->local_cachealloc -= nfree * size;
+		c->local_objects -= nfree;
+		runtime_MCentral_FreeSpan(&runtime_mheap.central[cl], s, nfree, head.next, end);
+	}
 }
 
 void
 runtime_gchelper(void)
 {
-	// Wait until main proc is ready for mark help.
-	runtime_lock(&work.markgate);
-	runtime_unlock(&work.markgate);
+	// run the parallel mark over the gc roots
+	runtime_parfordo(work.markfor);
+	// help other threads scan secondary blocks
 	scanblock(nil, 0);
 
-	// Wait until main proc is ready for sweep help.
-	runtime_lock(&work.sweepgate);
-	runtime_unlock(&work.sweepgate);
-	sweep();
+	if(DebugMark) {
+		// wait while the main thread executes mark(debug_scanblock)
+		while(runtime_atomicload(&work.debugmarkdone) == 0)
+			runtime_usleep(10);
+	}
 
+	runtime_parfordo(work.sweepfor);
 	if(runtime_xadd(&work.ndone, +1) == work.nproc-1)
 		runtime_notewakeup(&work.alldone);
 }
@@ -912,21 +914,31 @@ 
 }
 
 static void
-cachestats(void)
+cachestats(GCStats *stats)
 {
 	M *m;
 	MCache *c;
 	uint32 i;
 	uint64 stacks_inuse;
 	uint64 stacks_sys;
+	uint64 *src, *dst;
 
+	if(stats)
+		runtime_memclr((byte*)stats, sizeof(*stats));
 	stacks_inuse = 0;
 	stacks_sys = runtime_stacks_sys;
 	for(m=runtime_allm; m; m=m->alllink) {
-		runtime_purgecachedstats(m);
+		c = m->mcache;
+		runtime_purgecachedstats(c);
 		// stacks_inuse += m->stackalloc->inuse;
 		// stacks_sys += m->stackalloc->sys;
-		c = m->mcache;
+		if(stats) {
+			src = (uint64*)&m->gcstats;
+			dst = (uint64*)stats;
+			for(i=0; i<sizeof(*stats)/sizeof(uint64); i++)
+				dst[i] += src[i];
+			runtime_memclr((byte*)&m->gcstats, sizeof(m->gcstats));
+		}
 		for(i=0; i<nelem(c->local_by_size); i++) {
 			mstats.by_size[i].nmalloc += c->local_by_size[i].nmalloc;
 			c->local_by_size[i].nmalloc = 0;
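
GC bookkeeping (handoffs, yields, sleeps) is now kept per M in m->gcstats
and folded into a single GCStats by cachestats().  The aggregation treats
the struct as a flat array of uint64, which assumes GCStats holds nothing
but uint64 counters.  A sketch of that folding step, not part of the patch:

	// Sketch: accumulate one M's counters into the caller's total.
	static void
	sum_gcstats(GCStats *dst, GCStats *src)
	{
		uint64 *d, *s;
		uint32 i;

		d = (uint64*)dst;
		s = (uint64*)src;
		for(i = 0; i < sizeof(*dst)/sizeof(uint64); i++)
			d[i] += s[i];
	}
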
@@ -945,7 +957,15 @@ 
 	int64 t0, t1, t2, t3;
 	uint64 heap0, heap1, obj0, obj1;
 	const byte *p;
-	bool extra;
+	GCStats stats;
+	M *m1;
+	uint32 i;
+
+	// The atomic operations are not atomic if the uint64s
+	// are not aligned on uint64 boundaries. This has been
+	// a problem in the past.
+	if((((uintptr)&work.empty) & 7) != 0)
+		runtime_throw("runtime: gc work buffer is misaligned");
 
 	// Make sure all registers are saved on stack so that
 	// scanstack sees them.
@@ -986,48 +1006,67 @@ 
 	}
 
 	t0 = runtime_nanotime();
-	nhandoff = 0;
 
 	m->gcing = 1;
 	runtime_stoptheworld();
 
-	cachestats();
-	heap0 = mstats.heap_alloc;
-	obj0 = mstats.nmalloc - mstats.nfree;
+	for(m1=runtime_allm; m1; m1=m1->alllink)
+		runtime_settype_flush(m1, false);
 
-	runtime_lock(&work.markgate);
-	runtime_lock(&work.sweepgate);
+	heap0 = 0;
+	obj0 = 0;
+	if(gctrace) {
+		cachestats(nil);
+		heap0 = mstats.heap_alloc;
+		obj0 = mstats.nmalloc - mstats.nfree;
+	}
 
-	extra = false;
-	work.nproc = 1;
-	if(runtime_gomaxprocs > 1 && runtime_ncpu > 1) {
-		runtime_noteclear(&work.alldone);
-		work.nproc += runtime_helpgc(&extra);
-	}
 	work.nwait = 0;
 	work.ndone = 0;
+	work.debugmarkdone = 0;
+	work.nproc = runtime_gcprocs();
+	addroots();
+	m->locks++;	// disable gc during mallocs in parforalloc
+	if(work.markfor == nil)
+		work.markfor = runtime_parforalloc(MaxGcproc);
+	runtime_parforsetup(work.markfor, work.nproc, work.nroot, nil, false, markroot);
+	if(work.sweepfor == nil)
+		work.sweepfor = runtime_parforalloc(MaxGcproc);
+	runtime_parforsetup(work.sweepfor, work.nproc, runtime_mheap.nspan, nil, true, sweepspan);
+	m->locks--;
+	if(work.nproc > 1) {
+		runtime_noteclear(&work.alldone);
+		runtime_helpgc(work.nproc);
+	}
 
-	runtime_unlock(&work.markgate);  // let the helpers in
-	mark(scanblock);
-	if(DebugMark)
-		mark(debug_scanblock);
+	runtime_parfordo(work.markfor);
+	scanblock(nil, 0);
+
+	if(DebugMark) {
+		for(i=0; i<work.nroot; i++)
+			debug_scanblock(work.roots[i].p, work.roots[i].n);
+		runtime_atomicstore(&work.debugmarkdone, 1);
+	}
 	t1 = runtime_nanotime();
 
-	work.spans = runtime_mheap.allspans;
-	runtime_unlock(&work.sweepgate);  // let the helpers in
-	sweep();
-	if(work.nproc > 1)
-		runtime_notesleep(&work.alldone);
+	runtime_parfordo(work.sweepfor);
 	t2 = runtime_nanotime();
 
 	stealcache();
-	cachestats();
+	cachestats(&stats);
+
+	if(work.nproc > 1)
+		runtime_notesleep(&work.alldone);
+
+	stats.nprocyield += work.sweepfor->nprocyield;
+	stats.nosyield += work.sweepfor->nosyield;
+	stats.nsleep += work.sweepfor->nsleep;
 
 	mstats.next_gc = mstats.heap_alloc+(mstats.heap_alloc-runtime_stacks_sys)*gcpercent/100;
 	m->gcing = 0;
 
-	m->locks++;	// disable gc during the mallocs in newproc
 	if(finq != nil) {
+		m->locks++;	// disable gc during the mallocs in newproc
 		// kick off or wake up goroutine to run queued finalizers
 		if(fing == nil)
 			fing = __go_go(runfinq, nil);
@@ -1035,10 +1074,9 @@ 
 			fingwait = 0;
 			runtime_ready(fing);
 		}
+		m->locks--;
 	}
-	m->locks--;
 
-	cachestats();
 	heap1 = mstats.heap_alloc;
 	obj1 = mstats.nmalloc - mstats.nfree;
 
@@ -1051,26 +1089,22 @@ 
 		runtime_printf("pause %D\n", t3-t0);
 
 	if(gctrace) {
-		runtime_printf("gc%d(%d): %D+%D+%D ms, %D -> %D MB %D -> %D (%D-%D) objects\n",
+		runtime_printf("gc%d(%d): %D+%D+%D ms, %D -> %D MB %D -> %D (%D-%D) objects,"
+				" %D(%D) handoff, %D(%D) steal, %D/%D/%D yields\n",
 			mstats.numgc, work.nproc, (t1-t0)/1000000, (t2-t1)/1000000, (t3-t2)/1000000,
 			heap0>>20, heap1>>20, obj0, obj1,
-			mstats.nmalloc, mstats.nfree);
+			mstats.nmalloc, mstats.nfree,
+			stats.nhandoff, stats.nhandoffcnt,
+			work.sweepfor->nsteal, work.sweepfor->nstealcnt,
+			stats.nprocyield, stats.nosyield, stats.nsleep);
 	}
-	
+
 	runtime_MProf_GC();
 	runtime_semrelease(&runtime_worldsema);
+	runtime_starttheworld();
 
-	// If we could have used another helper proc, start one now,
-	// in the hope that it will be available next time.
-	// It would have been even better to start it before the collection,
-	// but doing so requires allocating memory, so it's tricky to
-	// coordinate.  This lazy approach works out in practice:
-	// we don't mind if the first couple gc rounds don't have quite
-	// the maximum number of procs.
-	runtime_starttheworld(extra);
-
-	// give the queued finalizers, if any, a chance to run	
-	if(finq != nil)	
+	// give the queued finalizers, if any, a chance to run
+	if(finq != nil)
 		runtime_gosched();
 
 	if(gctrace > 1 && !force)
@@ -1093,22 +1127,23 @@ 
 	m = runtime_m();
 	m->gcing = 1;
 	runtime_stoptheworld();
-	cachestats();
+	cachestats(nil);
 	*stats = mstats;
 	m->gcing = 0;
 	runtime_semrelease(&runtime_worldsema);
-	runtime_starttheworld(false);
+	runtime_starttheworld();
 }
 
 static void
 runfinq(void* dummy __attribute__ ((unused)))
 {
-	G* gp;
 	Finalizer *f;
 	FinBlock *fb, *next;
 	uint32 i;
 
-	gp = runtime_g();
+	if(raceenabled)
+		runtime_racefingo();
+
 	for(;;) {
 		// There's no need for a lock in this section
 		// because it only conflicts with the garbage
@@ -1120,9 +1155,7 @@ 
 		finq = nil;
 		if(fb == nil) {
 			fingwait = 1;
-			gp->status = Gwaiting;
-			gp->waitreason = "finalizer wait";
-			runtime_gosched();
+			runtime_park(nil, nil, "finalizer wait");
 			continue;
 		}
 		for(; fb; fb=next) {
diff -r bf12a7f41b67 libgo/runtime/mheap.c
--- a/libgo/runtime/mheap.c	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/runtime/mheap.c	Mon Oct 22 17:36:23 2012 -0700
@@ -27,11 +27,24 @@ 
 {
 	MHeap *h;
 	MSpan *s;
+	MSpan **all;
+	uint32 cap;
 
 	h = vh;
 	s = (MSpan*)p;
-	s->allnext = h->allspans;
-	h->allspans = s;
+	if(h->nspan >= h->nspancap) {
+		cap = 64*1024/sizeof(all[0]);
+		if(cap < h->nspancap*3/2)
+			cap = h->nspancap*3/2;
+		all = (MSpan**)runtime_SysAlloc(cap*sizeof(all[0]));
+		if(h->allspans) {
+			runtime_memmove(all, h->allspans, h->nspancap*sizeof(all[0]));
+			runtime_SysFree(h->allspans, h->nspancap*sizeof(all[0]));
+		}
+		h->allspans = all;
+		h->nspancap = cap;
+	}
+	h->allspans[h->nspan++] = s;
 }
 
 // Initialize the heap; fetch memory using alloc.
@@ -53,12 +66,12 @@ 
 // Allocate a new span of npage pages from the heap
 // and record its size class in the HeapMap and HeapMapCache.
 MSpan*
-runtime_MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, int32 acct)
+runtime_MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, int32 acct, int32 zeroed)
 {
 	MSpan *s;
 
 	runtime_lock(h);
-	runtime_purgecachedstats(runtime_m());
+	runtime_purgecachedstats(runtime_m()->mcache);
 	s = MHeap_AllocLocked(h, npage, sizeclass);
 	if(s != nil) {
 		mstats.heap_inuse += npage<<PageShift;
@@ -68,6 +81,8 @@ 
 		}
 	}
 	runtime_unlock(h);
+	if(s != nil && *(uintptr*)(s->start<<PageShift) != 0 && zeroed)
+		runtime_memclr((byte*)(s->start<<PageShift), s->npages<<PageShift);
 	return s;
 }
 
@@ -125,12 +140,11 @@ 
 		MHeap_FreeLocked(h, t);
 	}
 
-	if(*(uintptr*)(s->start<<PageShift) != 0)
-		runtime_memclr((byte*)(s->start<<PageShift), s->npages<<PageShift);
-
 	// Record span info, because gc needs to be
 	// able to map interior pointer to containing span.
 	s->sizeclass = sizeclass;
+	s->elemsize = (sizeclass==0 ? s->npages<<PageShift : (uintptr)runtime_class_to_size[sizeclass]);
+	s->types.compression = MTypes_Empty;
 	p = s->start;
 	if(sizeof(void*) == 8)
 		p -= ((uintptr)h->arena_start>>PageShift);
@@ -259,7 +273,7 @@ 
 runtime_MHeap_Free(MHeap *h, MSpan *s, int32 acct)
 {
 	runtime_lock(h);
-	runtime_purgecachedstats(runtime_m());
+	runtime_purgecachedstats(runtime_m()->mcache);
 	mstats.heap_inuse -= s->npages<<PageShift;
 	if(acct) {
 		mstats.heap_alloc -= s->npages<<PageShift;
@@ -276,6 +290,10 @@ 
 	MSpan *t;
 	PageID p;
 
+	if(s->types.sysalloc)
+		runtime_settype_sysfree(s);
+	s->types.compression = MTypes_Empty;
+
 	if(s->state != MSpanInUse || s->ref != 0) {
 		runtime_printf("MHeap_FreeLocked - span %p ptr %p state %d ref %d\n", s, s->start<<PageShift, s->state, s->ref);
 		runtime_throw("MHeap_FreeLocked - invalid free");
@@ -416,9 +434,11 @@ 
 	span->freelist = nil;
 	span->ref = 0;
 	span->sizeclass = 0;
+	span->elemsize = 0;
 	span->state = 0;
 	span->unusedsince = 0;
 	span->npreleased = 0;
+	span->types.compression = MTypes_Empty;
 }
 
 // Initialize an empty doubly-linked list.
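
MHeap_FreeLocked no longer clears returned spans eagerly.  Instead the free
paths stamp the first word of the span as "needs zeroing" and MHeap_Alloc
clears it only when the caller asks for zeroed memory via the new zeroed
parameter (runtime_MCentral_Grow passes 1; the other callers live in
malloc.goc, which is not in this excerpt).  The convention, restated as a
sketch rather than new code:

	// Sketch: deferred zeroing of spans handed back to the heap.
	static void
	mark_span_dirty(MSpan *s)
	{
		*(uintptr*)(s->start<<PageShift) = 1;	// needs zeroing
	}

	static void
	maybe_zero_span(MSpan *s, int32 zeroed)
	{
		if(zeroed && *(uintptr*)(s->start<<PageShift) != 0)
			runtime_memclr((byte*)(s->start<<PageShift), s->npages<<PageShift);
	}
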
diff -r bf12a7f41b67 libgo/runtime/mprof.goc
--- a/libgo/runtime/mprof.goc	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/runtime/mprof.goc	Mon Oct 22 17:36:23 2012 -0700
@@ -15,21 +15,35 @@ 
 // NOTE(rsc): Everything here could use cas if contention became an issue.
 static Lock proflock;
 
-// Per-call-stack allocation information.
+enum { MProf, BProf };  // profile types
+
+// Per-call-stack profiling information.
 // Lookup by hashing call stack into a linked-list hash table.
 typedef struct Bucket Bucket;
 struct Bucket
 {
 	Bucket	*next;	// next in hash list
-	Bucket	*allnext;	// next in list of all buckets
-	uintptr	allocs;
-	uintptr	frees;
-	uintptr	alloc_bytes;
-	uintptr	free_bytes;
-	uintptr	recent_allocs;  // since last gc
-	uintptr	recent_frees;
-	uintptr	recent_alloc_bytes;
-	uintptr	recent_free_bytes;
+	Bucket	*allnext;	// next in list of all mbuckets/bbuckets
+	int32	typ;
+	union
+	{
+		struct  // typ == MProf
+		{
+			uintptr	allocs;
+			uintptr	frees;
+			uintptr	alloc_bytes;
+			uintptr	free_bytes;
+			uintptr	recent_allocs;  // since last gc
+			uintptr	recent_frees;
+			uintptr	recent_alloc_bytes;
+			uintptr	recent_free_bytes;
+		};
+		struct  // typ == BProf
+		{
+			int64	count;
+			int64	cycles;
+		};
+	};
 	uintptr	hash;
 	uintptr	nstk;
 	uintptr	stk[1];
@@ -38,12 +52,13 @@ 
 	BuckHashSize = 179999,
 };
 static Bucket **buckhash;
-static Bucket *buckets;
+static Bucket *mbuckets;  // memory profile buckets
+static Bucket *bbuckets;  // blocking profile buckets
 static uintptr bucketmem;
 
 // Return the bucket for stk[0:nstk], allocating new bucket if needed.
 static Bucket*
-stkbucket(uintptr *stk, int32 nstk, bool alloc)
+stkbucket(int32 typ, uintptr *stk, int32 nstk, bool alloc)
 {
 	int32 i;
 	uintptr h;
@@ -66,7 +81,7 @@ 
 
 	i = h%BuckHashSize;
 	for(b = buckhash[i]; b; b=b->next)
-		if(b->hash == h && b->nstk == (uintptr)nstk &&
+		if(b->typ == typ && b->hash == h && b->nstk == (uintptr)nstk &&
 		   runtime_mcmp((byte*)b->stk, (byte*)stk, nstk*sizeof stk[0]) == 0)
 			return b;
 
@@ -76,12 +91,18 @@ 
 	b = runtime_mallocgc(sizeof *b + nstk*sizeof stk[0], FlagNoProfiling, 0, 1);
 	bucketmem += sizeof *b + nstk*sizeof stk[0];
 	runtime_memmove(b->stk, stk, nstk*sizeof stk[0]);
+	b->typ = typ;
 	b->hash = h;
 	b->nstk = nstk;
 	b->next = buckhash[i];
 	buckhash[i] = b;
-	b->allnext = buckets;
-	buckets = b;
+	if(typ == MProf) {
+		b->allnext = mbuckets;
+		mbuckets = b;
+	} else {
+		b->allnext = bbuckets;
+		bbuckets = b;
+	}
 	return b;
 }
 
@@ -92,7 +113,7 @@ 
 	Bucket *b;
 	
 	runtime_lock(&proflock);
-	for(b=buckets; b; b=b->allnext) {
+	for(b=mbuckets; b; b=b->allnext) {
 		b->allocs += b->recent_allocs;
 		b->frees += b->recent_frees;
 		b->alloc_bytes += b->recent_alloc_bytes;
@@ -107,20 +128,26 @@ 
 
 // Map from pointer to Bucket* that allocated it.
 // Three levels:
-//	Linked-list hash table for top N-20 bits.
-//	Array index for next 13 bits.
-//	Linked list for next 7 bits.
+//	Linked-list hash table for top N-AddrHashShift bits.
+//	Array index for next AddrDenseBits bits.
+//	Linked list for next AddrHashShift-AddrDenseBits bits.
 // This is more efficient than using a general map,
 // because of the typical clustering of the pointer keys.
 
 typedef struct AddrHash AddrHash;
 typedef struct AddrEntry AddrEntry;
 
+enum {
+	AddrHashBits = 12,	// good for 4GB of used address space
+	AddrHashShift = 20,	// each AddrHash knows about 1MB of address space
+	AddrDenseBits = 8,	// good for a profiling rate of 4096 bytes
+};
+
 struct AddrHash
 {
 	AddrHash *next;	// next in top-level hash table linked list
 	uintptr addr;	// addr>>20
-	AddrEntry *dense[1<<13];
+	AddrEntry *dense[1<<AddrDenseBits];
 };
 
 struct AddrEntry
@@ -130,9 +157,6 @@ 
 	Bucket *b;
 };
 
-enum {
-	AddrHashBits = 12	// 1MB per entry, so good for 4GB of used address space
-};
 static AddrHash *addrhash[1<<AddrHashBits];
 static AddrEntry *addrfree;
 static uintptr addrmem;
@@ -155,15 +179,15 @@ 
 	AddrHash *ah;
 	AddrEntry *e;
 
-	h = (uint32)((addr>>20)*HashMultiplier) >> (32-AddrHashBits);
+	h = (uint32)((addr>>AddrHashShift)*HashMultiplier) >> (32-AddrHashBits);
 	for(ah=addrhash[h]; ah; ah=ah->next)
-		if(ah->addr == (addr>>20))
+		if(ah->addr == (addr>>AddrHashShift))
 			goto found;
 
 	ah = runtime_mallocgc(sizeof *ah, FlagNoProfiling, 0, 1);
 	addrmem += sizeof *ah;
 	ah->next = addrhash[h];
-	ah->addr = addr>>20;
+	ah->addr = addr>>AddrHashShift;
 	addrhash[h] = ah;
 
 found:
@@ -175,9 +199,9 @@ 
 		e[63].next = nil;
 	}
 	addrfree = e->next;
-	e->addr = (uint32)~(addr & ((1<<20)-1));
+	e->addr = (uint32)~(addr & ((1<<AddrHashShift)-1));
 	e->b = b;
-	h = (addr>>7)&(nelem(ah->dense)-1);	// entry in dense is top 13 bits of low 20.
+	h = (addr>>(AddrHashShift-AddrDenseBits))&(nelem(ah->dense)-1);	// entry in dense is top 8 bits of low 20.
 	e->next = ah->dense[h];
 	ah->dense[h] = e;
 }
@@ -191,16 +215,16 @@ 
 	AddrEntry *e, **l;
 	Bucket *b;
 
-	h = (uint32)((addr>>20)*HashMultiplier) >> (32-AddrHashBits);
+	h = (uint32)((addr>>AddrHashShift)*HashMultiplier) >> (32-AddrHashBits);
 	for(ah=addrhash[h]; ah; ah=ah->next)
-		if(ah->addr == (addr>>20))
+		if(ah->addr == (addr>>AddrHashShift))
 			goto found;
 	return nil;
 
 found:
-	h = (addr>>7)&(nelem(ah->dense)-1);	// entry in dense is top 13 bits of low 20.
+	h = (addr>>(AddrHashShift-AddrDenseBits))&(nelem(ah->dense)-1);	// entry in dense is top 8 bits of low 20.
 	for(l=&ah->dense[h]; (e=*l) != nil; l=&e->next) {
-		if(e->addr == (uint32)~(addr & ((1<<20)-1))) {
+		if(e->addr == (uint32)~(addr & ((1<<AddrHashShift)-1))) {
 			*l = e->next;
 			b = e->b;
 			e->next = addrfree;
@@ -227,7 +251,7 @@ 
 	m->nomemprof++;
 	nstk = runtime_callers(1, stk, 32);
 	runtime_lock(&proflock);
-	b = stkbucket(stk, nstk, true);
+	b = stkbucket(MProf, stk, nstk, true);
 	b->recent_allocs++;
 	b->recent_alloc_bytes += size;
 	setaddrbucket((uintptr)p, b);
@@ -259,6 +283,37 @@ 
 	m->nomemprof--;
 }
 
+int64 runtime_blockprofilerate;  // in CPU ticks
+
+void runtime_SetBlockProfileRate(intgo) asm("runtime.SetBlockProfileRate");
+
+void
+runtime_SetBlockProfileRate(intgo rate)
+{
+	runtime_atomicstore64((uint64*)&runtime_blockprofilerate, rate * runtime_tickspersecond() / (1000*1000*1000));
+}
+
+void
+runtime_blockevent(int64 cycles, int32 skip)
+{
+	int32 nstk;
+	int64 rate;
+	uintptr stk[32];
+	Bucket *b;
+
+	if(cycles <= 0)
+		return;
+	rate = runtime_atomicload64((uint64*)&runtime_blockprofilerate);
+	if(rate <= 0 || (rate > cycles && runtime_fastrand1()%rate > cycles))
+		return;
+
+	nstk = runtime_callers(skip, stk, 32);
+	runtime_lock(&proflock);
+	b = stkbucket(BProf, stk, nstk, true);
+	b->count++;
+	b->cycles += cycles;
+	runtime_unlock(&proflock);
+}
 
 // Go interface to profile data.  (Declared in extern.go)
 // Assumes Go sizeof(int) == sizeof(int32)
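
runtime_blockevent is the generic entry point for the new blocking
profiler; the call sites that actually time blocked goroutines (channel and
sync primitives) are part of the larger library import and are not in this
excerpt.  A hedged sketch of how a blocking primitive would report a
blocked interval -- the helper name and the Note-based wait are made up for
illustration:

	// Sketch (hypothetical call site): measure a blocking wait in CPU
	// ticks and report it; skip trims the innermost frames so the profile
	// points at the caller.
	static void
	blocking_wait(Note *n)
	{
		int64 t0;

		t0 = runtime_cputicks();
		runtime_notesleep(n);			// the operation that may block
		runtime_blockevent(runtime_cputicks() - t0, 2);
	}
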
@@ -287,20 +342,20 @@ 
 		r->stk[i] = 0;
 }
 
-func MemProfile(p Slice, include_inuse_zero bool) (n int32, ok bool) {
+func MemProfile(p Slice, include_inuse_zero bool) (n int, ok bool) {
 	Bucket *b;
 	Record *r;
 
 	runtime_lock(&proflock);
 	n = 0;
-	for(b=buckets; b; b=b->allnext)
+	for(b=mbuckets; b; b=b->allnext)
 		if(include_inuse_zero || b->alloc_bytes != b->free_bytes)
 			n++;
 	ok = false;
 	if(n <= p.__count) {
 		ok = true;
 		r = (Record*)p.__values;
-		for(b=buckets; b; b=b->allnext)
+		for(b=mbuckets; b; b=b->allnext)
 			if(include_inuse_zero || b->alloc_bytes != b->free_bytes)
 				record(r++, b);
 	}
@@ -308,12 +363,46 @@ 
 }
 
 void
-runtime_MProf_Mark(void (*scan)(byte *, int64))
+runtime_MProf_Mark(void (*addroot)(byte *, uintptr))
 {
 	// buckhash is not allocated via mallocgc.
-	scan((byte*)&buckets, sizeof buckets);
-	scan((byte*)&addrhash, sizeof addrhash);
-	scan((byte*)&addrfree, sizeof addrfree);
+	addroot((byte*)&mbuckets, sizeof mbuckets);
+	addroot((byte*)&bbuckets, sizeof bbuckets);
+	addroot((byte*)&addrhash, sizeof addrhash);
+	addroot((byte*)&addrfree, sizeof addrfree);
+}
+
+// Must match BlockProfileRecord in debug.go.
+typedef struct BRecord BRecord;
+struct BRecord {
+	int64 count;
+	int64 cycles;
+	uintptr stk[32];
+};
+
+func BlockProfile(p Slice) (n int, ok bool) {
+	Bucket *b;
+	BRecord *r;
+	int32 i;
+
+	runtime_lock(&proflock);
+	n = 0;
+	for(b=bbuckets; b; b=b->allnext)
+		n++;
+	ok = false;
+	if(n <= p.__count) {
+		ok = true;
+		r = (BRecord*)p.__values;
+		for(b=bbuckets; b; b=b->allnext, r++) {
+			r->count = b->count;
+			r->cycles = b->cycles;
+			for(i=0; (uintptr)i<b->nstk && (uintptr)i<nelem(r->stk); i++)
+				r->stk[i] = b->stk[i];
+			for(; (uintptr)i<nelem(r->stk); i++)
+				r->stk[i] = 0;			
+		}
+	}
+	runtime_unlock(&proflock);
 }
 
 // Must match StackRecord in debug.go.
@@ -322,7 +411,7 @@ 
 	uintptr stk[32];
 };
 
-func ThreadCreateProfile(p Slice) (n int32, ok bool) {
+func ThreadCreateProfile(p Slice) (n int, ok bool) {
 	TRecord *r;
 	M *first, *m;
 	
@@ -341,7 +430,7 @@ 
 	}
 }
 
-func Stack(b Slice, all bool) (n int32) {
+func Stack(b Slice, all bool) (n int) {
 	byte *pc, *sp;
 	bool enablegc;
 	
@@ -378,7 +467,7 @@ 
 		runtime_m()->gcing = 0;
 		mstats.enablegc = enablegc;
 		runtime_semrelease(&runtime_worldsema);
-		runtime_starttheworld(false);
+		runtime_starttheworld();
 	}
 }
 
@@ -397,7 +486,7 @@ 
 		r->stk[n] = 0;
 }
 
-func GoroutineProfile(b Slice) (n int32, ok bool) {
+func GoroutineProfile(b Slice) (n int, ok bool) {
 	TRecord *r;
 	G *gp;
 	
@@ -423,7 +512,7 @@ 
 	
 		runtime_m()->gcing = 0;
 		runtime_semrelease(&runtime_worldsema);
-		runtime_starttheworld(false);
+		runtime_starttheworld();
 	}
 }
 
diff -r bf12a7f41b67 libgo/runtime/panic.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libgo/runtime/panic.c	Mon Oct 22 17:36:23 2012 -0700
@@ -0,0 +1,115 @@ 
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "runtime.h"
+#include "go-defer.h"
+
+// Code related to defer, panic and recover.
+
+uint32 runtime_panicking;
+static Lock paniclk;
+
+// Run all deferred functions for the current goroutine.
+static void
+rundefer(void)
+{
+	G *g;
+	Defer *d;
+
+	g = runtime_g();
+	while((d = g->defer) != nil) {
+		void (*pfn)(void*);
+
+		g->defer = d->__next;
+		pfn = d->__pfn;
+		d->__pfn = nil;
+		if (pfn != nil)
+			(*pfn)(d->__arg);
+		runtime_free(d);
+	}
+}
+
+void
+runtime_startpanic(void)
+{
+	M *m;
+
+	m = runtime_m();
+	if(m->dying) {
+		runtime_printf("panic during panic\n");
+		runtime_exit(3);
+	}
+	m->dying = 1;
+	runtime_xadd(&runtime_panicking, 1);
+	runtime_lock(&paniclk);
+}
+
+void
+runtime_dopanic(int32 unused __attribute__ ((unused)))
+{
+	G *g;
+	static bool didothers;
+
+	g = runtime_g();
+	if(g->sig != 0)
+		runtime_printf("[signal %x code=%p addr=%p]\n",
+			       g->sig, (void*)g->sigcode0, (void*)g->sigcode1);
+
+	if(runtime_gotraceback()){
+		if(g != runtime_m()->g0) {
+			runtime_printf("\n");
+			runtime_goroutineheader(g);
+			runtime_traceback();
+			runtime_goroutinetrailer(g);
+		}
+		if(!didothers) {
+			didothers = true;
+			runtime_tracebackothers(g);
+		}
+	}
+	runtime_unlock(&paniclk);
+	if(runtime_xadd(&runtime_panicking, -1) != 0) {
+		// Some other m is panicking too.
+		// Let it print what it needs to print.
+		// Wait forever without chewing up cpu.
+		// It will exit when it's done.
+		static Lock deadlock;
+		runtime_lock(&deadlock);
+		runtime_lock(&deadlock);
+	}
+
+	runtime_exit(2);
+}
+
+void
+runtime_throw(const char *s)
+{
+	runtime_startpanic();
+	runtime_printf("throw: %s\n", s);
+	runtime_dopanic(0);
+	*(int32*)0 = 0;	// not reached
+	runtime_exit(1);	// even more not reached
+}
+
+void
+runtime_panicstring(const char *s)
+{
+	Eface err;
+
+	if(runtime_m()->gcing) {
+		runtime_printf("panic: %s\n", s);
+		runtime_throw("panic during gc");
+	}
+	runtime_newErrorString(runtime_gostringnocopy((const byte*)s), &err);
+	runtime_panic(err);
+}
+
+void runtime_Goexit (void) asm ("runtime.Goexit");
+
+void
+runtime_Goexit(void)
+{
+	rundefer();
+	runtime_goexit();
+}
diff -r bf12a7f41b67 libgo/runtime/parfor.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libgo/runtime/parfor.c	Mon Oct 22 17:36:23 2012 -0700
@@ -0,0 +1,232 @@ 
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Parallel for algorithm.
+
+#include "runtime.h"
+#include "arch.h"
+
+struct ParForThread
+{
+	// the thread's iteration space [32lsb, 32msb)
+	uint64 pos;
+	// stats
+	uint64 nsteal;
+	uint64 nstealcnt;
+	uint64 nprocyield;
+	uint64 nosyield;
+	uint64 nsleep;
+	byte pad[CacheLineSize];
+};
+
+ParFor*
+runtime_parforalloc(uint32 nthrmax)
+{
+	ParFor *desc;
+
+	// The ParFor object is followed by CacheLineSize padding
+	// and then nthrmax ParForThread.
+	desc = (ParFor*)runtime_malloc(sizeof(ParFor) + CacheLineSize + nthrmax * sizeof(ParForThread));
+	desc->thr = (ParForThread*)((byte*)(desc+1) + CacheLineSize);
+	desc->nthrmax = nthrmax;
+	return desc;
+}
+
+// For testing from Go
+// func parforalloc2(nthrmax uint32) *ParFor
+
+ParFor *runtime_parforalloc2(uint32)
+   asm("runtime.parforalloc2");
+
+ParFor *
+runtime_parforalloc2(uint32 nthrmax)
+{
+	return runtime_parforalloc(nthrmax);
+}
+
+void
+runtime_parforsetup(ParFor *desc, uint32 nthr, uint32 n, void *ctx, bool wait, void (*body)(ParFor*, uint32))
+{
+	uint32 i, begin, end;
+
+	if(desc == nil || nthr == 0 || nthr > desc->nthrmax || body == nil) {
+		runtime_printf("desc=%p nthr=%d count=%d body=%p\n", desc, nthr, n, body);
+		runtime_throw("parfor: invalid args");
+	}
+
+	desc->body = body;
+	desc->done = 0;
+	desc->nthr = nthr;
+	desc->thrseq = 0;
+	desc->cnt = n;
+	desc->ctx = ctx;
+	desc->wait = wait;
+	desc->nsteal = 0;
+	desc->nstealcnt = 0;
+	desc->nprocyield = 0;
+	desc->nosyield = 0;
+	desc->nsleep = 0;
+	for(i=0; i<nthr; i++) {
+		begin = (uint64)n*i / nthr;
+		end = (uint64)n*(i+1) / nthr;
+		desc->thr[i].pos = (uint64)begin | (((uint64)end)<<32);
+	}
+}
+
+// For testing from Go
+// func parforsetup2(desc *ParFor, nthr, n uint32, ctx *byte, wait bool, body func(*ParFor, uint32))
+
+void runtime_parforsetup2(ParFor *, uint32, uint32, void *, bool, void *)
+  asm("runtime.parforsetup2");
+
+void
+runtime_parforsetup2(ParFor *desc, uint32 nthr, uint32 n, void *ctx, bool wait, void *body)
+{
+	runtime_parforsetup(desc, nthr, n, ctx, wait, (void(*)(ParFor*, uint32))body);
+}
+
+void
+runtime_parfordo(ParFor *desc)
+{
+	ParForThread *me;
+	uint32 tid, begin, end, begin2, try, victim, i;
+	uint64 *mypos, *victimpos, pos, newpos;
+	void (*body)(ParFor*, uint32);
+	bool idle;
+
+	// Obtain 0-based thread index.
+	tid = runtime_xadd(&desc->thrseq, 1) - 1;
+	if(tid >= desc->nthr) {
+		runtime_printf("tid=%d nthr=%d\n", tid, desc->nthr);
+		runtime_throw("parfor: invalid tid");
+	}
+
+	// If single-threaded, just execute the for serially.
+	if(desc->nthr==1) {
+		for(i=0; i<desc->cnt; i++)
+			desc->body(desc, i);
+		return;
+	}
+
+	body = desc->body;
+	me = &desc->thr[tid];
+	mypos = &me->pos;
+	for(;;) {
+		for(;;) {
+			// While there is local work,
+			// bump low index and execute the iteration.
+			pos = runtime_xadd64(mypos, 1);
+			begin = (uint32)pos-1;
+			end = (uint32)(pos>>32);
+			if(begin < end) {
+				body(desc, begin);
+				continue;
+			}
+			break;
+		}
+
+		// Out of work, need to steal something.
+		idle = false;
+		for(try=0;; try++) {
+			// If we don't see any work for long enough,
+			// increment the done counter...
+			if(try > desc->nthr*4 && !idle) {
+				idle = true;
+				runtime_xadd(&desc->done, 1);
+			}
+			// ...if all threads have incremented the counter,
+			// we are done.
+			if(desc->done + !idle == desc->nthr) {
+				if(!idle)
+					runtime_xadd(&desc->done, 1);
+				goto exit;
+			}
+			// Choose a random victim for stealing.
+			victim = runtime_fastrand1() % (desc->nthr-1);
+			if(victim >= tid)
+				victim++;
+			victimpos = &desc->thr[victim].pos;
+			pos = runtime_atomicload64(victimpos);
+			for(;;) {
+				// See if it has any work.
+				begin = (uint32)pos;
+				end = (uint32)(pos>>32);
+				if(begin >= end-1) {
+					begin = end = 0;
+					break;
+				}
+				if(idle) {
+					runtime_xadd(&desc->done, -1);
+					idle = false;
+				}
+				begin2 = begin + (end-begin)/2;
+				newpos = (uint64)begin | (uint64)begin2<<32;
+				if(runtime_cas64(victimpos, &pos, newpos)) {
+					begin = begin2;
+					break;
+				}
+			}
+			if(begin < end) {
+				// Has successfully stolen some work.
+				if(idle)
+					runtime_throw("parfor: should not be idle");
+				runtime_atomicstore64(mypos, (uint64)begin | (uint64)end<<32);
+				me->nsteal++;
+				me->nstealcnt += end-begin;
+				break;
+			}
+			// Backoff.
+			if(try < desc->nthr) {
+				// nothing
+			} else if (try < 4*desc->nthr) {
+				me->nprocyield++;
+				runtime_procyield(20);
+			// If a caller asked not to wait for the others, exit now
+			// (assume that most work is already done at this point).
+			} else if (!desc->wait) {
+				if(!idle)
+					runtime_xadd(&desc->done, 1);
+				goto exit;
+			} else if (try < 6*desc->nthr) {
+				me->nosyield++;
+				runtime_osyield();
+			} else {
+				me->nsleep++;
+				runtime_usleep(1);
+			}
+		}
+	}
+exit:
+	runtime_xadd64(&desc->nsteal, me->nsteal);
+	runtime_xadd64(&desc->nstealcnt, me->nstealcnt);
+	runtime_xadd64(&desc->nprocyield, me->nprocyield);
+	runtime_xadd64(&desc->nosyield, me->nosyield);
+	runtime_xadd64(&desc->nsleep, me->nsleep);
+	me->nsteal = 0;
+	me->nstealcnt = 0;
+	me->nprocyield = 0;
+	me->nosyield = 0;
+	me->nsleep = 0;
+}
+
+// For testing from Go
+// func parforiters(desc *ParFor, tid uintptr) (uintptr, uintptr)
+
+struct parforiters_ret {
+  uintptr start;
+  uintptr end;
+};
+
+struct parforiters_ret runtime_parforiters(ParFor *, uintptr)
+  asm("runtime.parforiters");
+
+struct parforiters_ret
+runtime_parforiters(ParFor *desc, uintptr tid)
+{
+	struct parforiters_ret ret;
+
+	ret.start = (uint32)desc->thr[tid].pos;
+	ret.end = (uint32)(desc->thr[tid].pos>>32);
+	return ret;
+}
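
For reference, the intended usage pattern of the parallel for (this is how
gc() drives work.markfor and work.sweepfor in mgc0.c above; the sketch is
illustrative and not part of the patch):

	// Body invoked for each index; idle threads steal ranges dynamically.
	static void
	process_item(ParFor *desc, uint32 i)
	{
		USED(&desc);
		USED(&i);
		// handle item i here; desc->ctx carries shared state if needed
	}

	// One thread sets the loop up ...
	static ParFor*
	setup_loop(uint32 nthr, uint32 n)
	{
		ParFor *desc;

		desc = runtime_parforalloc(nthr);	// nthr must not exceed the nthrmax given here
		runtime_parforsetup(desc, nthr, n, nil, true, process_item);
		return desc;
	}

	// ... and then each of the nthr participating threads calls:
	//	runtime_parfordo(desc);
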
diff -r bf12a7f41b67 libgo/runtime/print.c
--- a/libgo/runtime/print.c	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/runtime/print.c	Mon Oct 22 17:36:23 2012 -0700
@@ -156,15 +156,16 @@ 
 	int32 e, s, i, n;
 	float64 h;
 
-	if(runtime_isNaN(v)) {
+	if(ISNAN(v)) {
 		gwrite("NaN", 3);
 		return;
 	}
-	if(runtime_isInf(v, 1)) {
+	i = __builtin_isinf_sign(v);
+	if(i > 0) {
 		gwrite("+Inf", 4);
 		return;
 	}
-	if(runtime_isInf(v, -1)) {
+	if(i < 0) {
 		gwrite("-Inf", 4);
 		return;
 	}
@@ -290,8 +291,8 @@ 
 	// extern uint32 runtime_maxstring;
 
 	// if(v.len > runtime_maxstring) {
-	// 	gwrite("[invalid string]", 16);
-	// 	return;
+	//	gwrite("[string too long]", 17);
+	//	return;
 	// }
 	if(v.__length > 0)
 		gwrite(v.__data, v.__length);
diff -r bf12a7f41b67 libgo/runtime/proc.c
--- a/libgo/runtime/proc.c	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/runtime/proc.c	Mon Oct 22 17:36:23 2012 -0700
@@ -17,6 +17,7 @@ 
 #include "arch.h"
 #include "defs.h"
 #include "malloc.h"
+#include "race.h"
 #include "go-defer.h"
 
 #ifdef USING_SPLIT_STACK
@@ -330,6 +331,9 @@ 
 {
 	void (*fn)(void*);
 
+	if(g->traceback != nil)
+		gtraceback(g);
+
 	fn = (void (*)(void*))(g->entry);
 	fn(g->param);
 	runtime_goexit();
@@ -471,6 +475,9 @@ 
 	// Can not enable GC until all roots are registered.
 	// mstats.enablegc = 1;
 	m->nomemprof--;
+
+	if(raceenabled)
+		runtime_raceinit();
 }
 
 extern void main_init(void) __asm__ ("__go_init_main");
@@ -507,6 +514,8 @@ 
 	runtime_gosched();
 
 	main_main();
+	if(raceenabled)
+		runtime_racefini();
 	runtime_exit(0);
 	for(;;)
 		*(int32*)0 = 0;
@@ -540,11 +549,11 @@ 
 }
 
 void
-runtime_goroutineheader(G *g)
+runtime_goroutineheader(G *gp)
 {
 	const char *status;
 
-	switch(g->status) {
+	switch(gp->status) {
 	case Gidle:
 		status = "idle";
 		break;
@@ -558,8 +567,8 @@ 
 		status = "syscall";
 		break;
 	case Gwaiting:
-		if(g->waitreason)
-			status = g->waitreason;
+		if(gp->waitreason)
+			status = gp->waitreason;
 		else
 			status = "waiting";
 		break;
@@ -570,7 +579,7 @@ 
 		status = "???";
 		break;
 	}
-	runtime_printf("goroutine %d [%s]:\n", g->goid, status);
+	runtime_printf("goroutine %d [%s]:\n", gp->goid, status);
 }
 
 void
@@ -598,15 +607,15 @@ 
 void
 runtime_tracebackothers(G * volatile me)
 {
-	G * volatile g;
+	G * volatile gp;
 	Traceback traceback;
 
 	traceback.gp = me;
-	for(g = runtime_allg; g != nil; g = g->alllink) {
-		if(g == me || g->status == Gdead)
+	for(gp = runtime_allg; gp != nil; gp = gp->alllink) {
+		if(gp == me || gp->status == Gdead)
 			continue;
 		runtime_printf("\n");
-		runtime_goroutineheader(g);
+		runtime_goroutineheader(gp);
 
 		// Our only mechanism for doing a stack trace is
 		// _Unwind_Backtrace.  And that only works for the
@@ -616,25 +625,25 @@ 
 
 		// This means that if g is running or in a syscall, we
 		// can't reliably print a stack trace.  FIXME.
-		if(g->status == Gsyscall || g->status == Grunning) {
+		if(gp->status == Gsyscall || gp->status == Grunning) {
 			runtime_printf("no stack trace available\n");
-			runtime_goroutinetrailer(g);
+			runtime_goroutinetrailer(gp);
 			continue;
 		}
 
-		g->traceback = &traceback;
+		gp->traceback = &traceback;
 
 #ifdef USING_SPLIT_STACK
 		__splitstack_getcontext(&me->stack_context[0]);
 #endif
 		getcontext(&me->context);
 
-		if(g->traceback != nil) {
-			runtime_gogo(g);
+		if(gp->traceback != nil) {
+			runtime_gogo(gp);
 		}
 
 		runtime_printtrace(traceback.pcbuf, traceback.c);
-		runtime_goroutinetrailer(g);
+		runtime_goroutinetrailer(gp);
 	}
 }
 
@@ -666,22 +675,22 @@ 
 }
 
 static void
-mcommoninit(M *m)
+mcommoninit(M *mp)
 {
-	m->id = runtime_sched.mcount++;
-	m->fastrand = 0x49f6428aUL + m->id + runtime_cputicks();
+	mp->id = runtime_sched.mcount++;
+	mp->fastrand = 0x49f6428aUL + mp->id + runtime_cputicks();
 
-	if(m->mcache == nil)
-		m->mcache = runtime_allocmcache();
+	if(mp->mcache == nil)
+		mp->mcache = runtime_allocmcache();
 
-	runtime_callers(1, m->createstack, nelem(m->createstack));
+	runtime_callers(1, mp->createstack, nelem(mp->createstack));
 
 	// Add to runtime_allm so garbage collector doesn't free m
 	// when it is just in a register or thread-local storage.
-	m->alllink = runtime_allm;
+	mp->alllink = runtime_allm;
 	// runtime_NumCgoCall() iterates over allm w/o schedlock,
 	// so we need to publish it safely.
-	runtime_atomicstorep(&runtime_allm, m);
+	runtime_atomicstorep(&runtime_allm, mp);
 }
 
 // Try to increment mcpu.  Report whether succeeded.
@@ -701,34 +710,34 @@ 
 
 // Put on `g' queue.  Sched must be locked.
 static void
-gput(G *g)
+gput(G *gp)
 {
-	M *m;
+	M *mp;
 
 	// If g is wired, hand it off directly.
-	if((m = g->lockedm) != nil && canaddmcpu()) {
-		mnextg(m, g);
+	if((mp = gp->lockedm) != nil && canaddmcpu()) {
+		mnextg(mp, gp);
 		return;
 	}
 
 	// If g is the idle goroutine for an m, hand it off.
-	if(g->idlem != nil) {
-		if(g->idlem->idleg != nil) {
+	if(gp->idlem != nil) {
+		if(gp->idlem->idleg != nil) {
 			runtime_printf("m%d idle out of sync: g%d g%d\n",
-				g->idlem->id,
-				g->idlem->idleg->goid, g->goid);
+				gp->idlem->id,
+				gp->idlem->idleg->goid, gp->goid);
 			runtime_throw("runtime: double idle");
 		}
-		g->idlem->idleg = g;
+		gp->idlem->idleg = gp;
 		return;
 	}
 
-	g->schedlink = nil;
+	gp->schedlink = nil;
 	if(runtime_sched.ghead == nil)
-		runtime_sched.ghead = g;
+		runtime_sched.ghead = gp;
 	else
-		runtime_sched.gtail->schedlink = g;
-	runtime_sched.gtail = g;
+		runtime_sched.gtail->schedlink = gp;
+	runtime_sched.gtail = gp;
 
 	// increment gwait.
 	// if it transitions to nonzero, set atomic gwaiting bit.
@@ -747,11 +756,11 @@ 
 static G*
 gget(void)
 {
-	G *g;
+	G *gp;
 
-	g = runtime_sched.ghead;
-	if(g){
-		runtime_sched.ghead = g->schedlink;
+	gp = runtime_sched.ghead;
+	if(gp) {
+		runtime_sched.ghead = gp->schedlink;
 		if(runtime_sched.ghead == nil)
 			runtime_sched.gtail = nil;
 		// decrement gwait.
@@ -759,45 +768,45 @@ 
 		if(--runtime_sched.gwait == 0)
 			runtime_xadd(&runtime_sched.atomic, -1<<gwaitingShift);
 	} else if(m->idleg != nil) {
-		g = m->idleg;
+		gp = m->idleg;
 		m->idleg = nil;
 	}
-	return g;
+	return gp;
 }
 
 // Put on `m' list.  Sched must be locked.
 static void
-mput(M *m)
+mput(M *mp)
 {
-	m->schedlink = runtime_sched.mhead;
-	runtime_sched.mhead = m;
+	mp->schedlink = runtime_sched.mhead;
+	runtime_sched.mhead = mp;
 	runtime_sched.mwait++;
 }
 
 // Get an `m' to run `g'.  Sched must be locked.
 static M*
-mget(G *g)
+mget(G *gp)
 {
-	M *m;
+	M *mp;
 
 	// if g has its own m, use it.
-	if(g && (m = g->lockedm) != nil)
-		return m;
+	if(gp && (mp = gp->lockedm) != nil)
+		return mp;
 
 	// otherwise use general m pool.
-	if((m = runtime_sched.mhead) != nil){
-		runtime_sched.mhead = m->schedlink;
+	if((mp = runtime_sched.mhead) != nil) {
+		runtime_sched.mhead = mp->schedlink;
 		runtime_sched.mwait--;
 	}
-	return m;
+	return mp;
 }
 
 // Mark g ready to run.
 void
-runtime_ready(G *g)
+runtime_ready(G *gp)
 {
 	schedlock();
-	readylocked(g);
+	readylocked(gp);
 	schedunlock();
 }
 
@@ -805,23 +814,23 @@ 
 // G might be running already and about to stop.
 // The sched lock protects g->status from changing underfoot.
 static void
-readylocked(G *g)
+readylocked(G *gp)
 {
-	if(g->m){
+	if(gp->m) {
 		// Running on another machine.
 		// Ready it when it stops.
-		g->readyonstop = 1;
+		gp->readyonstop = 1;
 		return;
 	}
 
 	// Mark runnable.
-	if(g->status == Grunnable || g->status == Grunning) {
-		runtime_printf("goroutine %d has status %d\n", g->goid, g->status);
+	if(gp->status == Grunnable || gp->status == Grunning) {
+		runtime_printf("goroutine %d has status %d\n", gp->goid, gp->status);
 		runtime_throw("bad g->status in ready");
 	}
-	g->status = Grunnable;
+	gp->status = Grunnable;
 
-	gput(g);
+	gput(gp);
 	matchmg();
 }
 
@@ -829,23 +838,23 @@ 
 // debuggers can set a breakpoint here and catch all
 // new goroutines.
 static void
-newprocreadylocked(G *g)
+newprocreadylocked(G *gp)
 {
-	readylocked(g);
+	readylocked(gp);
 }
 
 // Pass g to m for running.
 // Caller has already incremented mcpu.
 static void
-mnextg(M *m, G *g)
+mnextg(M *mp, G *gp)
 {
 	runtime_sched.grunning++;
-	m->nextg = g;
-	if(m->waitnextg) {
-		m->waitnextg = 0;
+	mp->nextg = gp;
+	if(mp->waitnextg) {
+		mp->waitnextg = 0;
 		if(mwakeup != nil)
 			runtime_notewakeup(&mwakeup->havenextg);
-		mwakeup = m;
+		mwakeup = mp;
 	}
 }
 
@@ -969,35 +978,38 @@ 
 }
 
 int32
-runtime_helpgc(bool *extra)
+runtime_gcprocs(void)
+{
+	int32 n;
+	
+	// Figure out how many CPUs to use during GC.
+	// Limited by gomaxprocs, number of actual CPUs, and MaxGcproc.
+	n = runtime_gomaxprocs;
+	if(n > runtime_ncpu)
+		n = runtime_ncpu > 0 ? runtime_ncpu : 1;
+	if(n > MaxGcproc)
+		n = MaxGcproc;
+	if(n > runtime_sched.mwait+1) // one M is currently running
+		n = runtime_sched.mwait+1;
+	return n;
+}
+
+void
+runtime_helpgc(int32 nproc)
 {
 	M *mp;
-	int32 n, max;
-
-	// Figure out how many CPUs to use.
-	// Limited by gomaxprocs, number of actual CPUs, and MaxGcproc.
-	max = runtime_gomaxprocs;
-	if(max > runtime_ncpu)
-		max = runtime_ncpu > 0 ? runtime_ncpu : 1;
-	if(max > MaxGcproc)
-		max = MaxGcproc;
-
-	// We're going to use one CPU no matter what.
-	// Figure out the max number of additional CPUs.
-	max--;
+	int32 n;
 
 	runtime_lock(&runtime_sched);
-	n = 0;
-	while(n < max && (mp = mget(nil)) != nil) {
-		n++;
+	for(n = 1; n < nproc; n++) { // one M is currently running
+		mp = mget(nil);
+		if(mp == nil)
+			runtime_throw("runtime_gcprocs inconsistency");
 		mp->helpgc = 1;
 		mp->waitnextg = 0;
 		runtime_notewakeup(&mp->havenextg);
 	}
 	runtime_unlock(&runtime_sched);
-	if(extra)
-		*extra = n != max;
-	return n;
 }
 
 void
@@ -1037,26 +1049,38 @@ 
 }
 
 void
-runtime_starttheworld(bool extra)
+runtime_starttheworld(void)
 {
-	M *m;
+	M *mp;
+	int32 max;
+	
+	// Figure out how many CPUs GC could possibly use.
+	max = runtime_gomaxprocs;
+	if(max > runtime_ncpu)
+		max = runtime_ncpu > 0 ? runtime_ncpu : 1;
+	if(max > MaxGcproc)
+		max = MaxGcproc;
 
 	schedlock();
 	runtime_gcwaiting = 0;
 	setmcpumax(runtime_gomaxprocs);
 	matchmg();
-	if(extra && canaddmcpu()) {
-		// Start a new m that will (we hope) be idle
-		// and so available to help when the next
-		// garbage collection happens.
+	if(runtime_gcprocs() < max && canaddmcpu()) {
+		// If GC could have used another helper proc, start one now,
+		// in the hope that it will be available next time.
+		// It would have been even better to start it before the collection,
+		// but doing so requires allocating memory, so it's tricky to
+		// coordinate.  This lazy approach works out in practice:
+		// we don't mind if the first couple gc rounds don't have quite
+		// the maximum number of procs.
 		// canaddmcpu above did mcpu++
 		// (necessary, because m will be doing various
 		// initialization work so is definitely running),
 		// but m is not running a specific goroutine,
 		// so set the helpgc flag as a signal to m's
 		// first schedule(nil) to mcpu-- and grunning--.
-		m = runtime_newm();
-		m->helpgc = 1;
+		mp = runtime_newm();
+		mp->helpgc = 1;
 		runtime_sched.grunning++;
 	}
 	schedunlock();
@@ -1110,6 +1134,11 @@ 
 		runtime_initsig();
 
 	schedule(nil);
+
+	// TODO(brainman): This point is never reached, because scheduler
+	// does not release os threads at the moment. But once this path
+	// is enabled, we must remove our seh here.
+
 	return nil;
 }
 
@@ -1148,14 +1177,14 @@ 
 M*
 runtime_newm(void)
 {
-	M *m;
+	M *mp;
 	pthread_attr_t attr;
 	pthread_t tid;
 	size_t stacksize;
 
-	m = runtime_malloc(sizeof(M));
-	mcommoninit(m);
-	m->g0 = runtime_malg(-1, nil, nil);
+	mp = runtime_malloc(sizeof(M));
+	mcommoninit(mp);
+	mp->g0 = runtime_malg(-1, nil, nil);
 
 	if(pthread_attr_init(&attr) != 0)
 		runtime_throw("pthread_attr_init");
@@ -1175,10 +1204,10 @@ 
 	if(pthread_attr_setstacksize(&attr, stacksize) != 0)
 		runtime_throw("pthread_attr_setstacksize");
 
-	if(pthread_create(&tid, &attr, runtime_mstart, m) != 0)
+	if(pthread_create(&tid, &attr, runtime_mstart, mp) != 0)
 		runtime_throw("pthread_create");
 
-	return m;
+	return mp;
 }
 
 // One round of scheduler: find a goroutine and run it.
@@ -1202,7 +1231,7 @@ 
 		if(atomic_mcpu(v) > maxgomaxprocs)
 			runtime_throw("negative mcpu in scheduler");
 
-		switch(gp->status){
+		switch(gp->status) {
 		case Grunnable:
 		case Gdead:
 			// Shouldn't have been running!
@@ -1212,6 +1241,8 @@ 
 			gput(gp);
 			break;
 		case Gmoribund:
+			if(raceenabled)
+				runtime_racegoend(gp->goid);
 			gp->status = Gdead;
 			if(gp->lockedm) {
 				gp->lockedm = nil;
@@ -1224,7 +1255,7 @@ 
 				runtime_exit(0);
 			break;
 		}
-		if(gp->readyonstop){
+		if(gp->readyonstop) {
 			gp->readyonstop = 0;
 			readylocked(gp);
 		}
@@ -1272,6 +1303,18 @@ 
 	runtime_mcall(schedule);
 }
 
+// Puts the current goroutine into a waiting state and unlocks the lock.
+// The goroutine can be made runnable again by calling runtime_ready(gp).
+void
+runtime_park(void (*unlockf)(Lock*), Lock *lock, const char *reason)
+{
+	g->status = Gwaiting;
+	g->waitreason = reason;
+	if(unlockf)
+		unlockf(lock);
+	runtime_gosched();
+}
+
 // The goroutine g is about to enter a system call.
 // Record that it's not using the cpu anymore.
 // This is called only from the go syscall library and cgocall,
@@ -1448,10 +1491,15 @@ 
 	byte *sp;
 	size_t spsize;
 	G *newg;
+	int32 goid;
+
+	goid = runtime_xadd((uint32*)&runtime_sched.goidgen, 1);
+	if(raceenabled)
+		runtime_racegostart(goid, runtime_getcallerpc(&fn));
 
 	schedlock();
 
-	if((newg = gfget()) != nil){
+	if((newg = gfget()) != nil) {
 #ifdef USING_SPLIT_STACK
 		int dont_block_signals = 0;
 
@@ -1482,8 +1530,7 @@ 
 	newg->gopc = (uintptr)__builtin_return_address(0);
 
 	runtime_sched.gcount++;
-	runtime_sched.goidgen++;
-	newg->goid = runtime_sched.goidgen;
+	newg->goid = goid;
 
 	if(sp == nil)
 		runtime_throw("nil g->stack0");
@@ -1512,49 +1559,22 @@ 
 
 // Put on gfree list.  Sched must be locked.
 static void
-gfput(G *g)
+gfput(G *gp)
 {
-	g->schedlink = runtime_sched.gfree;
-	runtime_sched.gfree = g;
+	gp->schedlink = runtime_sched.gfree;
+	runtime_sched.gfree = gp;
 }
 
 // Get from gfree list.  Sched must be locked.
 static G*
 gfget(void)
 {
-	G *g;
+	G *gp;
 
-	g = runtime_sched.gfree;
-	if(g)
-		runtime_sched.gfree = g->schedlink;
-	return g;
-}
-
-// Run all deferred functions for the current goroutine.
-static void
-rundefer(void)
-{
-	Defer *d;
-
-	while((d = g->defer) != nil) {
-		void (*pfn)(void*);
-
-		pfn = d->__pfn;
-		d->__pfn = nil;
-		if (pfn != nil)
-			(*pfn)(d->__arg);
-		g->defer = d->__next;
-		runtime_free(d);
-	}
-}
-
-void runtime_Goexit (void) asm ("runtime.Goexit");
-
-void
-runtime_Goexit(void)
-{
-	rundefer();
-	runtime_goexit();
+	gp = runtime_sched.gfree;
+	if(gp)
+		runtime_sched.gfree = gp->schedlink;
+	return gp;
 }
 
 void runtime_Gosched (void) asm ("runtime.Gosched");
@@ -1651,10 +1671,10 @@ 
 	return m->id;
 }
 
-int32 runtime_NumGoroutine (void)
+intgo runtime_NumGoroutine (void)
   __asm__ ("runtime.NumGoroutine");
 
-int32
+intgo
 runtime_NumGoroutine()
 {
 	return runtime_sched.gcount;
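
[Editor's note: the proc.c hunks above add runtime_park(unlockf, lock, reason), and later hunks in sema.goc and time.goc replace the open-coded "set g->status, set g->waitreason, unlock, gosched" sequences with it.  As a minimal sketch of that park/ready pattern only -- a standalone POSIX-threads analogue, not the gccgo runtime code; all names here (park, ready, waker, runnable) are invented for illustration:]

/* Standalone analogue of the park/ready pattern added in proc.c:
   park() drops the caller's lock and blocks; ready() makes the parked
   thread runnable again.  Sketch only, using pthreads, not the runtime. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int runnable;            /* set by ready() */
static const char *waitreason;  /* debugging aid, like g->waitreason */

static void park(pthread_mutex_t *l, const char *reason) {
	waitreason = reason;
	while (!runnable)                    /* wait until someone readies us */
		pthread_cond_wait(&cond, l); /* releases l while waiting */
	waitreason = NULL;
	pthread_mutex_unlock(l);
}

static void ready(void) {
	pthread_mutex_lock(&lock);
	runnable = 1;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
}

static void *waker(void *arg) {
	(void)arg;
	ready();
	return NULL;
}

int main(void) {
	pthread_t t;
	pthread_mutex_lock(&lock);
	pthread_create(&t, NULL, waker, NULL);
	park(&lock, "demo");    /* blocks until waker() calls ready() */
	pthread_join(t, NULL);
	puts("parked and readied");
	return 0;
}
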
diff -r bf12a7f41b67 libgo/runtime/race.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libgo/runtime/race.h	Mon Oct 22 17:36:23 2012 -0700
@@ -0,0 +1,30 @@ 
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Definitions related to data race detection.
+
+#ifdef RACE
+enum { raceenabled = 1 };
+#else
+enum { raceenabled = 0 };
+#endif
+
+// Initialize race detection subsystem.
+void	runtime_raceinit(void);
+// Finalize race detection subsystem, does not return.
+void	runtime_racefini(void);
+
+void	runtime_racemalloc(void *p, uintptr sz, void *pc);
+void	runtime_racefree(void *p);
+void	runtime_racegostart(int32 goid, void *pc);
+void	runtime_racegoend(int32 goid);
+void	runtime_racewritepc(void *addr, void *pc);
+void	runtime_racereadpc(void *addr, void *pc);
+void	runtime_racefingo(void);
+void	runtime_raceacquire(void *addr);
+void	runtime_raceacquireg(G *gp, void *addr);
+void	runtime_racerelease(void *addr);
+void	runtime_racereleaseg(G *gp, void *addr);
+void	runtime_racereleasemerge(void *addr);
+void	runtime_racereleasemergeg(G *gp, void *addr);
diff -r bf12a7f41b67 libgo/runtime/runtime.c
--- a/libgo/runtime/runtime.c	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/runtime/runtime.c	Mon Oct 22 17:36:23 2012 -0700
@@ -4,13 +4,13 @@ 
 
 #include <unistd.h>
 
+#include "config.h"
+
 #include "runtime.h"
 #include "array.h"
 #include "go-panic.h"
 #include "go-string.h"
 
-uint32	runtime_panicking;
-
 int32
 runtime_gotraceback(void)
 {
@@ -22,84 +22,6 @@ 
 	return runtime_atoi(p);
 }
 
-static Lock paniclk;
-
-void
-runtime_startpanic(void)
-{
-	M *m;
-
-	m = runtime_m();
-	if(m->dying) {
-		runtime_printf("panic during panic\n");
-		runtime_exit(3);
-	}
-	m->dying = 1;
-	runtime_xadd(&runtime_panicking, 1);
-	runtime_lock(&paniclk);
-}
-
-void
-runtime_dopanic(int32 unused __attribute__ ((unused)))
-{
-	G* g;
-	static bool didothers;
-
-	g = runtime_g();
-	if(g->sig != 0)
-		runtime_printf("[signal %x code=%p addr=%p]\n",
-			g->sig, (void*)(g->sigcode0), (void*)(g->sigcode1));
-
-	if(runtime_gotraceback()){
-		if(g != runtime_m()->g0) {
-			runtime_printf("\n");
-			runtime_goroutineheader(g);
-			runtime_traceback();
-			runtime_goroutinetrailer(g);
-		}
-		if(!didothers) {
-			didothers = true;
-			runtime_tracebackothers(g);
-		}
-	}
-
-	runtime_unlock(&paniclk);
-	if(runtime_xadd(&runtime_panicking, -1) != 0) {
-		// Some other m is panicking too.
-		// Let it print what it needs to print.
-		// Wait forever without chewing up cpu.
-		// It will exit when it's done.
-		static Lock deadlock;
-		runtime_lock(&deadlock);
-		runtime_lock(&deadlock);
-	}
-
-	runtime_exit(2);
-}
-
-void
-runtime_throw(const char *s)
-{
-	runtime_startpanic();
-	runtime_printf("throw: %s\n", s);
-	runtime_dopanic(0);
-	*(int32*)0 = 0;	// not reached
-	runtime_exit(1);	// even more not reached
-}
-
-void
-runtime_panicstring(const char *s)
-{
-	Eface err;
-
-	if(runtime_m()->gcing) {
-		runtime_printf("panic: %s\n", s);
-		runtime_throw("panic during gc");
-	}
-	runtime_newErrorString(runtime_gostringnocopy((const byte*)s), &err);
-	runtime_panic(err);
-}
-
 static int32	argc;
 static byte**	argv;
 
@@ -247,14 +169,41 @@ 
 	return traceback > 1 || (s != nil && __builtin_strchr((const char*)s, '.') != nil && __builtin_memcmp(s, "runtime.", 7) != 0);
 }
 
-bool
-runtime_isInf(float64 f, int32 sign)
+static Lock ticksLock;
+static int64 ticks;
+
+int64
+runtime_tickspersecond(void)
 {
-	if(!__builtin_isinf(f))
-		return false;
-	if(sign == 0)
-		return true;
-	if(sign > 0)
-		return f > 0;
-	return f < 0;
+	int64 res, t0, t1, c0, c1;
+
+	res = (int64)runtime_atomicload64((uint64*)&ticks);
+	if(res != 0)
+		return ticks;
+	runtime_lock(&ticksLock);
+	res = ticks;
+	if(res == 0) {
+		t0 = runtime_nanotime();
+		c0 = runtime_cputicks();
+		runtime_usleep(100*1000);
+		t1 = runtime_nanotime();
+		c1 = runtime_cputicks();
+		if(t1 == t0)
+			t1++;
+		res = (c1-c0)*1000*1000*1000/(t1-t0);
+		if(res == 0)
+			res++;
+		runtime_atomicstore64((uint64*)&ticks, res);
+	}
+	runtime_unlock(&ticksLock);
+	return res;
 }
+
+int64 runtime_pprof_runtime_cyclesPerSecond(void)
+     asm("runtime_pprof.runtime_cyclesPerSecond");
+
+int64
+runtime_pprof_runtime_cyclesPerSecond(void)
+{
+	return runtime_tickspersecond();
+}
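
[Editor's note: runtime_tickspersecond() above calibrates CPU ticks per second lazily: it double-checks under ticksLock, samples nanotime and cputicks around a 100ms sleep, and caches the quotient.  The following is a hedged standalone sketch of the same calibration arithmetic, assuming x86 with the GCC/Clang __rdtsc intrinsic as a stand-in for runtime_cputicks; the ticksLock double-check is omitted and this is not the libgo code:]

/* Sketch of the tick-rate calibration in runtime_tickspersecond():
   sample a cycle counter and a monotonic clock around a ~100ms sleep,
   then scale to ticks per second.  Assumes x86 and GCC/Clang. */
#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <x86intrin.h>

static int64_t nanotime(void) {
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
}

int main(void) {
	int64_t t0 = nanotime();
	uint64_t c0 = __rdtsc();
	usleep(100 * 1000);          /* 100ms, as in the runtime */
	int64_t t1 = nanotime();
	uint64_t c1 = __rdtsc();
	if (t1 == t0)
		t1++;                /* avoid dividing by zero */
	int64_t res = (int64_t)(c1 - c0) * 1000 * 1000 * 1000 / (t1 - t0);
	if (res == 0)
		res++;
	printf("~%lld ticks per second\n", (long long)res);
	return 0;
}
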
diff -r bf12a7f41b67 libgo/runtime/runtime.h
--- a/libgo/runtime/runtime.h	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/runtime/runtime.h	Mon Oct 22 17:36:23 2012 -0700
@@ -1,8 +1,6 @@ 
-/* runtime.h -- runtime support for Go.
-
-   Copyright 2009 The Go Authors. All rights reserved.
-   Use of this source code is governed by a BSD-style
-   license that can be found in the LICENSE file.  */
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
 
 #include "config.h"
 
@@ -42,8 +40,12 @@ 
 typedef unsigned int uint64  __attribute__ ((mode (DI)));
 typedef float        float32 __attribute__ ((mode (SF)));
 typedef double       float64 __attribute__ ((mode (DF)));
+typedef signed int   intptr __attribute__ ((mode (pointer)));
 typedef unsigned int uintptr __attribute__ ((mode (pointer)));
 
+typedef int		intgo; // Go's int
+typedef unsigned int	uintgo; // Go's uint
+
 /* Defined types.  */
 
 typedef	uint8			bool;
@@ -59,6 +61,10 @@ 
 typedef	struct	Hchan		Hchan;
 typedef	struct	Timers		Timers;
 typedef	struct	Timer		Timer;
+typedef struct	GCStats		GCStats;
+typedef struct	LFNode		LFNode;
+typedef struct	ParFor		ParFor;
+typedef struct	ParForThread	ParForThread;
 
 typedef	struct	__go_open_array		Slice;
 typedef	struct	__go_string		String;
@@ -105,6 +111,10 @@ 
 	true	= 1,
 	false	= 0,
 };
+enum
+{
+	PtrSize = sizeof(void*),
+};
 
 /*
  * structures
@@ -119,6 +129,16 @@ 
 	uint32	key;	// futex-based impl
 	M*	waitm;	// waiting M (sema-based impl)
 };
+struct	GCStats
+{
+	// the struct must consist of only uint64's,
+	// because it is casted to uint64[].
+	uint64	nhandoff;
+	uint64	nhandoffcnt;
+	uint64	nprocyield;
+	uint64	nosyield;
+	uint64	nsleep;
+};
 struct	G
 {
 	Defer*	defer;
@@ -142,6 +162,7 @@ 
 	G*	schedlink;
 	bool	readyonstop;
 	bool	ispanic;
+	int8	raceignore; // ignore race detection events
 	M*	m;		// for debuggers, but offset not hard-coded
 	M*	lockedm;
 	M*	idlem;
@@ -190,6 +211,14 @@ 
 	uintptr	waitsema;	// semaphore for parking on locks
 	uint32	waitsemacount;
 	uint32	waitsemalock;
+	GCStats	gcstats;
+	bool	racecall;
+	void*	racepc;
+
+	uintptr	settype_buf[1024];
+	uintptr	settype_bufsize;
+
+	uintptr	end[];
 };
 
 struct	SigTab
@@ -218,7 +247,6 @@ 
 	uintptr	entry;	// entry pc
 };
 
-/* Macros.  */
 
 #ifdef GOOS_windows
 enum {
@@ -257,6 +285,34 @@ 
 	Eface	arg;
 };
 
+// Lock-free stack node.
+struct LFNode
+{
+	LFNode	*next;
+	uintptr	pushcnt;
+};
+
+// Parallel for descriptor.
+struct ParFor
+{
+	void (*body)(ParFor*, uint32);	// executed for each element
+	uint32 done;			// number of idle threads
+	uint32 nthr;			// total number of threads
+	uint32 nthrmax;			// maximum number of threads
+	uint32 thrseq;			// thread id sequencer
+	uint32 cnt;			// iteration space [0, cnt)
+	void *ctx;			// arbitrary user context
+	bool wait;			// if true, wait while all threads finish processing,
+					// otherwise parfor may return while other threads are still working
+	ParForThread *thr;		// array of thread descriptors
+	// stats
+	uint64 nsteal;
+	uint64 nstealcnt;
+	uint64 nprocyield;
+	uint64 nosyield;
+	uint64 nsleep;
+};
+
 /*
  * defined macros
  *    you need super-gopher-guru privilege
@@ -265,6 +321,7 @@ 
 #define	nelem(x)	(sizeof(x)/sizeof((x)[0]))
 #define	nil		((void*)0)
 #define USED(v)		((void) v)
+#define	ROUND(x, n)	(((x)+(n)-1)&~((n)-1)) /* all-caps to mark as macro: it evaluates n twice */
 
 /*
  * external data
@@ -312,7 +369,8 @@ 
 void	runtime_minit(void);
 void	runtime_mallocinit(void);
 void	runtime_gosched(void);
-void	runtime_tsleep(int64);
+void	runtime_park(void(*)(Lock*), Lock*, const char*);
+void	runtime_tsleep(int64, const char*);
 M*	runtime_newm(void);
 void	runtime_goexit(void);
 void	runtime_entersyscall(void) __asm__("syscall.Entersyscall");
@@ -322,9 +380,12 @@ 
 int32	runtime_callers(int32, uintptr*, int32);
 int64	runtime_nanotime(void);
 int64	runtime_cputicks(void);
+int64	runtime_tickspersecond(void);
+void	runtime_blockevent(int64, int32);
+extern int64 runtime_blockprofilerate;
 
 void	runtime_stoptheworld(void);
-void	runtime_starttheworld(bool);
+void	runtime_starttheworld(void);
 extern uint32 runtime_worldsema;
 G*	__go_go(void (*pfn)(void*), void*);
 
@@ -372,6 +433,28 @@ 
 void	runtime_futexwakeup(uint32*, uint32);
 
 /*
+ * Lock-free stack.
+ * Initialize uint64 head to 0, compare with 0 to test for emptiness.
+ * The stack does not keep pointers to nodes,
+ * so they can be garbage collected if there are no other pointers to nodes.
+ */
+void	runtime_lfstackpush(uint64 *head, LFNode *node)
+  asm("runtime.lfstackpush");
+LFNode*	runtime_lfstackpop(uint64 *head);
+
+/*
+ * Parallel for over [0, n).
+ * body() is executed for each iteration.
+ * nthr - total number of worker threads.
+ * ctx - arbitrary user context.
+ * if wait=true, threads return from parfor() when all work is done;
+ * otherwise, threads can return while other threads are still finishing processing.
+ */
+ParFor*	runtime_parforalloc(uint32 nthrmax);
+void	runtime_parforsetup(ParFor *desc, uint32 nthr, uint32 n, void *ctx, bool wait, void (*body)(ParFor*, uint32));
+void	runtime_parfordo(ParFor *desc) asm("runtime.parfordo");
+
+/*
  * low level C-called
  */
 #define runtime_mmap mmap
@@ -432,12 +515,17 @@ 
 void	free(void *v);
 #define runtime_cas(pval, old, new) __sync_bool_compare_and_swap (pval, old, new)
 #define runtime_casp(pval, old, new) __sync_bool_compare_and_swap (pval, old, new)
+#define runtime_cas64(pval, pold, new) __atomic_compare_exchange_n (pval, pold, new, 1, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)
 #define runtime_xadd(p, v) __sync_add_and_fetch (p, v)
+#define runtime_xadd64(p, v) __sync_add_and_fetch (p, v)
 #define runtime_xchg(p, v) __atomic_exchange_n (p, v, __ATOMIC_SEQ_CST)
 #define runtime_atomicload(p) __atomic_load_n (p, __ATOMIC_SEQ_CST)
 #define runtime_atomicstore(p, v) __atomic_store_n (p, v, __ATOMIC_SEQ_CST)
 #define runtime_atomicloadp(p) __atomic_load_n (p, __ATOMIC_SEQ_CST)
 #define runtime_atomicstorep(p, v) __atomic_store_n (p, v, __ATOMIC_SEQ_CST)
+#define runtime_atomicload64(p) __atomic_load_n (p, __ATOMIC_SEQ_CST)
+#define runtime_atomicstore64(p, v) __atomic_store_n (p, v, __ATOMIC_SEQ_CST)
+#define PREFETCH(p) __builtin_prefetch(p)
 
 struct __go_func_type;
 bool	runtime_addfinalizer(void*, void(*fn)(void*), const struct __go_func_type *);
@@ -469,8 +557,7 @@ 
 /*
  * wrapped for go users
  */
-bool	runtime_isInf(float64 f, int32 sign);
-#define runtime_isNaN(f) __builtin_isnan(f)
+#define ISNAN(f) __builtin_isnan(f)
 void	runtime_semacquire(uint32 volatile *);
 void	runtime_semrelease(uint32 volatile *);
 int32	runtime_gomaxprocsfunc(int32 n);
@@ -493,8 +580,13 @@ 
 // This is a no-op on other systems.
 void	runtime_setprof(bool);
 
-void	runtime_time_scan(void (*)(byte*, int64));
-void	runtime_trampoline_scan(void (*)(byte *, int64));
+enum
+{
+	UseSpanType = 1,
+};
+
+void	runtime_time_scan(void (*)(byte*, uintptr));
+void	runtime_trampoline_scan(void (*)(byte *, uintptr));
 
 void	runtime_setsig(int32, bool, bool);
 #define runtime_setitimer setitimer
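
[Editor's note: runtime.h now declares the ParFor parallel-for helper (runtime_parforalloc, runtime_parforsetup, runtime_parfordo), which runs body() for every index in [0, cnt) across nthr threads.  Those entry points are internal to the runtime, so below is only a minimal standalone analogue of the idea -- a static split of [0, n) across POSIX threads, without the work stealing or the stats counters; every name in it is invented for illustration:]

/* Standalone analogue of the ParFor idea declared in runtime.h:
   run body(i) for every i in [0, n), splitting the range across threads.
   Static split with pthreads; the runtime's work stealing is omitted. */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NTHR 4

struct chunk {
	uint32_t begin, end;         /* half-open range [begin, end) */
	void (*body)(uint32_t);
};

static void *worker(void *arg) {
	struct chunk *c = arg;
	for (uint32_t i = c->begin; i < c->end; i++)
		c->body(i);
	return NULL;
}

static void parfor(uint32_t n, void (*body)(uint32_t)) {
	pthread_t tid[NTHR];
	struct chunk c[NTHR];
	for (int t = 0; t < NTHR; t++) {
		c[t].begin = (uint32_t)((uint64_t)n * t / NTHR);
		c[t].end = (uint32_t)((uint64_t)n * (t + 1) / NTHR);
		c[t].body = body;
		pthread_create(&tid[t], NULL, worker, &c[t]);
	}
	for (int t = 0; t < NTHR; t++)
		pthread_join(tid[t], NULL);
}

static void body(uint32_t i) {
	printf("iteration %u\n", i);
}

int main(void) {
	parfor(10, body);
	return 0;
}
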
diff -r bf12a7f41b67 libgo/runtime/runtime1.goc
--- a/libgo/runtime/runtime1.goc	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/runtime/runtime1.goc	Mon Oct 22 17:36:23 2012 -0700
@@ -5,10 +5,10 @@ 
 package runtime
 #include "runtime.h"
 
-func GOMAXPROCS(n int32) (ret int32) {
+func GOMAXPROCS(n int) (ret int) {
 	ret = runtime_gomaxprocsfunc(n);
 }
 
-func NumCPU() (ret int32) {
+func NumCPU() (ret int) {
 	ret = runtime_ncpu;
 }
diff -r bf12a7f41b67 libgo/runtime/sema.goc
--- a/libgo/runtime/sema.goc	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/runtime/sema.goc	Mon Oct 22 17:36:23 2012 -0700
@@ -24,30 +24,32 @@ 
 typedef struct Sema Sema;
 struct Sema
 {
-	uint32 volatile *addr;
-	G *g;
-	Sema *prev;
-	Sema *next;
+	uint32 volatile*	addr;
+	G*	g;
+	int64	releasetime;
+	Sema*	prev;
+	Sema*	next;
 };
 
 typedef struct SemaRoot SemaRoot;
 struct SemaRoot
 {
-        Lock;
-	Sema *head;
-	Sema *tail;
+	Lock;
+	Sema*	head;
+	Sema*	tail;
 	// Number of waiters. Read w/o the lock.
-	uint32 volatile nwait;
+	uint32 volatile	nwait;
 };
 
 // Prime to not correlate with any user patterns.
 #define SEMTABLESZ 251
 
-static union
+union semtable
 {
 	SemaRoot;
 	uint8 pad[CacheLineSize];
-} semtable[SEMTABLESZ];
+};
+static union semtable semtable[SEMTABLESZ];
 
 static SemaRoot*
 semroot(uint32 volatile *addr)
@@ -95,13 +97,13 @@ 
 	return 0;
 }
 
-void
-runtime_semacquire(uint32 volatile *addr)
+static void
+semacquireimpl(uint32 volatile *addr, int32 profile)
 {
-	G *g;
-	Sema s;
+	Sema s;	// Needs to be allocated on stack, otherwise garbage collector could deallocate it
 	SemaRoot *root;
-
+	int64 t0;
+	
 	// Easy case.
 	if(cansemacquire(addr))
 		return;
@@ -112,8 +114,13 @@ 
 	//	enqueue itself as a waiter
 	//	sleep
 	//	(waiter descriptor is dequeued by signaler)
-	g = runtime_g();
 	root = semroot(addr);
+	t0 = 0;
+	s.releasetime = 0;
+	if(profile && runtime_blockprofilerate > 0) {
+		t0 = runtime_cputicks();
+		s.releasetime = -1;
+	}
 	for(;;) {
 
 		runtime_lock(root);
@@ -128,16 +135,22 @@ 
 		// Any semrelease after the cansemacquire knows we're waiting
 		// (we set nwait above), so go to sleep.
 		semqueue(root, addr, &s);
-		g->status = Gwaiting;
-		g->waitreason = "semacquire";
-		runtime_unlock(root);
-		runtime_gosched();
-		if(cansemacquire(addr))
+		runtime_park(runtime_unlock, root, "semacquire");
+		if(cansemacquire(addr)) {
+			if(t0)
+				runtime_blockevent(s.releasetime - t0, 3);
 			return;
+		}
 	}
 }
 
 void
+runtime_semacquire(uint32 volatile *addr)
+{
+	semacquireimpl(addr, 0);
+}
+
+void
 runtime_semrelease(uint32 volatile *addr)
 {
 	Sema *s;
@@ -168,12 +181,15 @@ 
 		}
 	}
 	runtime_unlock(root);
-	if(s)
+	if(s) {
+		if(s->releasetime)
+			s->releasetime = runtime_cputicks();
 		runtime_ready(s->g);
+	}
 }
 
 func runtime_Semacquire(addr *uint32) {
-	runtime_semacquire(addr);
+	semacquireimpl(addr, 1);
 }
 
 func runtime_Semrelease(addr *uint32) {
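
[Editor's note: the sema.goc changes above hook semaphore waits into the new blocking profiler: when runtime_blockprofilerate is set, the waiter records runtime_cputicks() before parking, the releaser stamps s->releasetime as it readies the waiter, and the difference is reported via runtime_blockevent.  As a hedged sketch of only the blocked-time measurement, using a plain mutex and CLOCK_MONOTONIC rather than the runtime's cputicks/releasetime handshake; all names are invented for illustration:]

/* Sketch of the blocked-time measurement added to semacquire: note the
   time before blocking, note it again after acquiring, report the delta.
   Not the runtime code; the real version stamps the release time from
   the releasing side. */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t mu = PTHREAD_MUTEX_INITIALIZER;

static int64_t nanotime(void) {
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
}

static void *holder(void *arg) {
	(void)arg;
	pthread_mutex_lock(&mu);
	usleep(50 * 1000);           /* hold the lock for ~50ms */
	pthread_mutex_unlock(&mu);
	return NULL;
}

int main(void) {
	pthread_t t;
	pthread_create(&t, NULL, holder, NULL);
	usleep(10 * 1000);           /* give holder() time to take the lock */
	int64_t t0 = nanotime();     /* like t0 = runtime_cputicks() */
	pthread_mutex_lock(&mu);     /* blocks while holder() sleeps */
	int64_t blocked = nanotime() - t0;
	pthread_mutex_unlock(&mu);
	pthread_join(t, NULL);
	printf("blocked for %lld ns\n", (long long)blocked);
	return 0;
}
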
diff -r bf12a7f41b67 libgo/runtime/string.goc
--- a/libgo/runtime/string.goc	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/runtime/string.goc	Mon Oct 22 17:36:23 2012 -0700
@@ -32,8 +32,8 @@ 
 	Runeself	= 0x80,
 };
 
-func stringiter(s String, k int32) (retk int32) {
-	int32 l, n;
+func stringiter(s String, k int) (retk int) {
+	int32 l;
 
 	if(k >= s.__length) {
 		// retk=0 is end of iteration
@@ -48,15 +48,12 @@ 
 	}
 
 	// multi-char rune
-	n = charntorune(&l, s.__data+k, s.__length-k);
-	retk = k + (n ? n : 1);
+	retk = k + charntorune(&l, s.__data+k, s.__length-k);
 
 out:
 }
 
-func stringiter2(s String, k int32) (retk int32, retv int32) {
-	int32 n;
-
+func stringiter2(s String, k int) (retk int, retv int) {
 	if(k >= s.__length) {
 		// retk=0 is end of iteration
 		retk = 0;
@@ -71,8 +68,7 @@ 
 	}
 
 	// multi-char rune
-	n = charntorune(&retv, s.__data+k, s.__length-k);
-	retk = k + (n ? n : 1);
+	retk = k + charntorune(&retv, s.__data+k, s.__length-k);
 
 out:
 }
diff -r bf12a7f41b67 libgo/runtime/time.goc
--- a/libgo/runtime/time.goc	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/runtime/time.goc	Mon Oct 22 17:36:23 2012 -0700
@@ -10,6 +10,7 @@ 
 #include "defs.h"
 #include "arch.h"
 #include "malloc.h"
+#include "race.h"
 
 static Timers timers;
 static void addtimer(Timer*);
@@ -22,17 +23,16 @@ 
 
 // Sleep puts the current goroutine to sleep for at least ns nanoseconds.
 func Sleep(ns int64) {
-	G *g;
-
-	g = runtime_g();
-	g->status = Gwaiting;
-	g->waitreason = "sleep";
-	runtime_tsleep(ns);
+	runtime_tsleep(ns, "sleep");
 }
 
 // startTimer adds t to the timer heap.
 func startTimer(t *Timer) {
+	if(raceenabled)
+		runtime_racerelease(t);
+	runtime_lock(&timers);
 	addtimer(t);
+	runtime_unlock(&timers);
 }
 
 // stopTimer removes t from the timer heap if it is there.
@@ -57,27 +57,24 @@ 
 }
 
 // Put the current goroutine to sleep for ns nanoseconds.
-// The caller must have set g->status and g->waitreason.
 void
-runtime_tsleep(int64 ns)
+runtime_tsleep(int64 ns, const char *reason)
 {
 	G* g;
 	Timer t;
 
 	g = runtime_g();
 
-	if(ns <= 0) {
-		g->status = Grunning;
-		g->waitreason = nil;
+	if(ns <= 0)
 		return;
-	}
 
 	t.when = runtime_nanotime() + ns;
 	t.period = 0;
 	t.f = ready;
 	t.arg.__object = g;
+	runtime_lock(&timers);
 	addtimer(&t);
-	runtime_gosched();
+	runtime_park(runtime_unlock, &timers, reason);
 }
 
 // Add a timer to the heap and start or kick the timer proc
@@ -88,7 +85,6 @@ 
 	int32 n;
 	Timer **nt;
 
-	runtime_lock(&timers);
 	if(timers.len >= timers.cap) {
 		// Grow slice.
 		n = 16;
@@ -116,7 +112,6 @@ 
 	}
 	if(timers.timerproc == nil)
 		timers.timerproc = __go_go(timerproc, nil);
-	runtime_unlock(&timers);
 }
 
 // Delete timer t from the heap.
@@ -159,13 +154,11 @@ 
 static void
 timerproc(void* dummy __attribute__ ((unused)))
 {
-	G *g;
 	int64 delta, now;
 	Timer *t;
 	void (*f)(int64, Eface);
 	Eface arg;
 
-	g = runtime_g();
 	for(;;) {
 		runtime_lock(&timers);
 		now = runtime_nanotime();
@@ -192,16 +185,15 @@ 
 			f = t->f;
 			arg = t->arg;
 			runtime_unlock(&timers);
+			if(raceenabled)
+				runtime_raceacquire(t);
 			f(now, arg);
 			runtime_lock(&timers);
 		}
 		if(delta < 0) {
 			// No timers left - put goroutine to sleep.
 			timers.rescheduling = true;
-			g->status = Gwaiting;
-			g->waitreason = "timer goroutine (idle)";
-			runtime_unlock(&timers);
-			runtime_gosched();
+			runtime_park(runtime_unlock, &timers, "timer goroutine (idle)");
 			continue;
 		}
 		// At least one timer pending.  Sleep until then.
@@ -263,7 +255,7 @@ 
 }
 
 void
-runtime_time_scan(void (*scan)(byte*, int64))
+runtime_time_scan(void (*addroot)(byte*, uintptr))
 {
-	scan((byte*)&timers, sizeof timers);
+	addroot((byte*)&timers, sizeof timers);
 }
diff -r bf12a7f41b67 libgo/testsuite/gotest
--- a/libgo/testsuite/gotest	Sun Oct 07 21:29:09 2012 -0700
+++ b/libgo/testsuite/gotest	Mon Oct 22 17:36:23 2012 -0700
@@ -346,6 +346,11 @@ 
 
 # They all compile; now generate the code to call them.
 
+testname() {
+	# Remove the package from the name used with the -test option.
+	echo $1 | sed 's/^.*\.//'
+}
+
 localname() {
 	# The package main has been renamed to __main__ when imported.
 	# Adjust its uses.
@@ -373,7 +378,7 @@ 
 	fi
 	# benchmarks are named BenchmarkFoo.
 	pattern='Benchmark([^a-z].*)?'
-	benchmarks=$($NM -p -v _gotest_.o $xofile | egrep " $test .*\."$pattern'$' | grep -v '\..*\..*\.' | fgrep -v '$' | fgrep -v ' __go_' | sed 's/.* //' | $symtogo)
+	benchmarks=$($NM -p -v _gotest_.o $xofile | egrep " $text .*\."$pattern'$' | grep -v '\..*\..*\.' | fgrep -v '$' | fgrep -v ' __go_' | sed 's/.* //' | $symtogo)
 
 	# examples are named ExampleFoo
 	pattern='Example([^a-z].*)?'
@@ -396,8 +401,9 @@ 
 	echo 'var tests = []testing.InternalTest {'
 	for i in $tests
 	do
+		n=$(testname $i)
 		j=$(localname $i)
-		echo '	{"'$i'", '$j'},'
+		echo '	{"'$n'", '$j'},'
 	done
 	echo '}'
 
@@ -407,8 +413,9 @@ 
 	echo 'var benchmarks = []testing.InternalBenchmark{ //'
 	for i in $benchmarks
 	do
+		n=$(testname $i)
 		j=$(localname $i)
-		echo '	{"'$i'", '$j'},'
+		echo '	{"'$n'", '$j'},'
 	done
 	echo '}'
 
@@ -417,8 +424,9 @@ 
 	# This doesn't work because we don't pick up the output.
 	#for i in $examples
 	#do
+	#	n=$(testname $i)
 	#	j=$(localname $i)
-	#	echo '	{"'$i'", '$j', ""},'
+	#	echo '	{"'$n'", '$j', ""},'
 	#done
 	echo '}'
 
Index: test/fixedbugs/bug358.go
===================================================================
--- test/fixedbugs/bug358.go	(revision 192508)
+++ test/fixedbugs/bug358.go	(working copy)
@@ -12,7 +12,7 @@  package main
 import (
 	"io/ioutil"	// GCCGO_ERROR "imported and not used"
 	"net/http"
-	"os"
+	"os"		// GCCGO_ERROR "imported and not used"
 )
 
 func makeHandler(fn func(http.ResponseWriter, *http.Request, string)) http.HandlerFunc {
Index: test/fixedbugs/bug369.go
===================================================================
--- test/fixedbugs/bug369.go	(revision 192508)
+++ test/fixedbugs/bug369.go	(working copy)
@@ -38,9 +38,9 @@  func BenchmarkSlowNonASCII(b *testing.B)
 }
 
 func main() {
-	os.Args = []string{os.Args[0], "-test.benchtime=0.1"}
+	os.Args = []string{os.Args[0], "-test.benchtime=100ms"}
 	flag.Parse()
-	
+
 	rslow := testing.Benchmark(BenchmarkSlowNonASCII)
 	rfast := testing.Benchmark(BenchmarkFastNonASCII)
 	tslow := rslow.NsPerOp()