--- //depot/user/rwatson/netperf/sys/conf/files	2004/11/03 14:10:39
+++ //depot/user/rwatson/percpu/sys/conf/files	2004/11/04 00:18:33
@@ -1675,6 +1675,7 @@
 security/mac_seeotheruids/mac_seeotheruids.c	optional mac_seeotheruids
 security/mac_stub/mac_stub.c	optional mac_stub
 security/mac_test/mac_test.c	optional mac_test
+test/test_synch_timing.c	standard
 ufs/ffs/ffs_alloc.c	optional ffs
 ufs/ffs/ffs_balloc.c	optional ffs
 ufs/ffs/ffs_inode.c	optional ffs
--- //depot/user/rwatson/netperf/sys/kern/kern_switch.c	2004/10/19 18:01:38
+++ //depot/user/rwatson/percpu/sys/kern/kern_switch.c	2004/11/07 18:32:56
@@ -564,6 +564,18 @@
  * Kernel thread preemption implementation.  Critical sections mark
  * regions of code in which preemptions are not allowed.
  */
+#ifdef INVARIANT_SUPPORT
+void
+_critical_assert(const char *file, int line)
+{
+
+	if (curthread->td_critnest == 0) {
+		panic("critical_asssert: critical section not owned "
+		    "at %s:%d", file, line);
+	}
+}
+#endif
+
 void
 critical_enter(void)
 {
@@ -573,6 +585,8 @@
 	if (td->td_critnest == 0)
 		cpu_critical_enter(td);
 	td->td_critnest++;
+	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
+	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
 }
 
 void
@@ -601,6 +615,8 @@
 	} else {
 		td->td_critnest--;
 	}
+	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
+	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
 }
 
 /*
--- //depot/user/rwatson/netperf/sys/sys/ktr.h	2004/11/03 14:10:39
+++ //depot/user/rwatson/percpu/sys/sys/ktr.h	2004/11/07 18:32:56
@@ -76,7 +76,8 @@
 #define	KTR_CALLOUT	0x02000000		/* Callouts and timeouts */
 #define	KTR_GEOM	0x04000000		/* GEOM I/O events */
 #define	KTR_BUSDMA	0x08000000		/* busdma(9) events */
-#define	KTR_ALL		0x0fffffff
+#define	KTR_CRITICAL	0010000000		/* Critical sections */
+#define	KTR_ALL		0x1fffffff
 
 /*
  * Trace classes which can be assigned to particular use at compile time
--- //depot/user/rwatson/netperf/sys/sys/systm.h	2004/11/03 14:10:39
+++ //depot/user/rwatson/percpu/sys/sys/systm.h	2004/11/06 17:55:50
@@ -86,6 +86,12 @@
 #define	__CTASSERT(x, y)	typedef char __assert ## y[(x) ? 1 : -1]
 #endif
 
+#ifdef	INVARIANTS
+#define	critical_assert()	_critical_assert(__FILE__, __LINE__)
+#else
+#define	critical_assert()
+#endif
+
 /*
  * XXX the hints declarations are even more misplaced than most declarations
  * in this file, since they are needed in one file (per arch) and only used
@@ -139,6 +145,9 @@
 void	cpu_rootconf(void);
 extern uint32_t crc32_tab[];
 uint32_t crc32(const void *buf, size_t size);
+#ifdef INVARIANT_SUPPORT
+void	_critical_assert(const char *file, int line);
+#endif
 void	critical_enter(void);
 void	critical_exit(void);
 void	init_param1(void);
--- //depot/user/rwatson/netperf/sys/vm/uma_core.c	2004/11/06 12:44:50
+++ //depot/user/rwatson/percpu/sys/vm/uma_core.c	2004/11/06 19:46:49
@@ -67,6 +67,7 @@
 #include <sys/sysctl.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/vmmeter.h>
 
@@ -382,48 +383,19 @@
 zone_timeout(uma_zone_t zone)
 {
 	uma_keg_t keg;
-	uma_cache_t cache;
 	u_int64_t alloc;
-	int cpu;
 
 	keg = zone->uz_keg;
 	alloc = 0;
 
 	/*
-	 * Aggregate per cpu cache statistics back to the zone.
-	 *
-	 * XXX This should be done in the sysctl handler.
-	 *
-	 * I may rewrite this to set a flag in the per cpu cache instead of
-	 * locking.  If the flag is not cleared on the next round I will have
-	 * to lock and do it here instead so that the statistics don't get too
-	 * far out of sync.
-	 */
-	if (!(keg->uk_flags & UMA_ZFLAG_INTERNAL)) {
-		for (cpu = 0; cpu <= mp_maxid; cpu++) {
-			if (CPU_ABSENT(cpu))
-				continue;
-			CPU_LOCK(cpu);
-			cache = &zone->uz_cpu[cpu];
-			/* Add them up, and reset */
-			alloc += cache->uc_allocs;
-			cache->uc_allocs = 0;
-			CPU_UNLOCK(cpu);
-		}
-	}
-
-	/* Now push these stats back into the zone.. */
-	ZONE_LOCK(zone);
-	zone->uz_allocs += alloc;
-
-	/*
 	 * Expand the zone hash table.
 	 *
 	 * This is done if the number of slabs is larger than the hash size.
 	 * What I'm trying to do here is completely reduce collisions.  This
 	 * may be a little aggressive.  Should I allow for two collisions max?
 	 */
-
+	ZONE_LOCK(zone);
 	if (keg->uk_flags & UMA_ZONE_HASH &&
 	    keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
 		struct uma_hash newhash;
@@ -611,6 +583,10 @@
 /*
  * Drains the per cpu caches for a zone.
  *
+ * NOTE: This may only be called while the zone is being turn down, and not
+ * during normal operation.  This is necessary in order that we do not have
+ * to migrate CPUs to drain the per-CPU caches.
+ *
  * Arguments:
  *	zone     The zone to drain, must be unlocked.
  *
@@ -624,12 +600,20 @@
 	int cpu;
 
 	/*
-	 * We have to lock each cpu cache before locking the zone
+	 * XXX: It is safe to not lock the per-CPU caches, because we're
+	 * tearing down the zone anyway.  I.e., there will be no further use
+	 * of the caches at this point.
+	 *
+	 * XXX: It would good to be able to assert that the zone is being
+	 * torn down to prevent improper use of cache_drain().
+	 *
+	 * XXX: We lock the zone before passing into bucket_cache_drain() as
+	 * it is used elsewhere.  Should the tear-down path be made special
+	 * there in some form?
 	 */
 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
 		if (CPU_ABSENT(cpu))
 			continue;
-		CPU_LOCK(cpu);
 		cache = &zone->uz_cpu[cpu];
 		bucket_drain(zone, cache->uc_allocbucket);
 		bucket_drain(zone, cache->uc_freebucket);
@@ -642,11 +626,6 @@
 	ZONE_LOCK(zone);
 	bucket_cache_drain(zone);
 	ZONE_UNLOCK(zone);
-	for (cpu = 0; cpu <= mp_maxid; cpu++) {
-		if (CPU_ABSENT(cpu))
-			continue;
-		CPU_UNLOCK(cpu);
-	}
 }
 
 /*
@@ -1784,6 +1763,7 @@
 	uma_cache_t cache;
 	uma_bucket_t bucket;
 	int cpu;
+	int count;
 	int badness;
 
 	/* This is the fast path allocation */
@@ -1818,12 +1798,28 @@
 		}
 	}
 
+	/*
+	 * If possible, allocate from the per-CPU cache.  We require two
+	 * types of protection: (1) we use critical sections to protect the
+	 * cache from corruption and races, and (2) we rely on CPU pinning
+	 * to fix which cache we work from in the event that we must
+	 * release the critical section.  Neither alone is sufficient due
+	 * to preemption and possible processor migration for the current
+	 * thread.  We may need to exist the critical section if we have to
+	 * acquire the zone mutex, which may require sleeping.
+	 */
+	count = 0;
 zalloc_restart:
+	sched_pin();
+	critical_enter();
 	cpu = PCPU_GET(cpuid);
-	CPU_LOCK(cpu);
 	cache = &zone->uz_cpu[cpu];
 
 zalloc_start:
+	count++;
+	KASSERT(count < 10, ("uma_zalloc_arg: count == 10"));
+	/* XXXRW: assert pinned? */
+	critical_assert();
 	bucket = cache->uc_allocbucket;
 
 	if (bucket) {
@@ -1836,12 +1832,13 @@
 			KASSERT(item != NULL,
 			    ("uma_zalloc: Bucket pointer mangled."));
 			cache->uc_allocs++;
+			critical_exit();
+			sched_unpin();
 #ifdef INVARIANTS
 			ZONE_LOCK(zone);
 			uma_dbg_alloc(zone, NULL, item);
 			ZONE_UNLOCK(zone);
 #endif
-			CPU_UNLOCK(cpu);
 			if (zone->uz_ctor != NULL) {
 				if (zone->uz_ctor(item, zone->uz_keg->uk_size,
 				    udata, flags) != 0) {
@@ -1871,7 +1868,34 @@
 			}
 		}
 	}
+	/*
+	 * Attempt to retrieve the item from the per-CPU cache has failed,
+	 * so we must go back to the zone.  This requires the zone lock, so
+	 * we must drop the critical section, then re-acquire it when we go
+	 * back to the cache.  Since the critical section is released, we
+	 * may be preempted.  However, as we are pinned, we should be able
+	 * to return to the same cache.  If things have changed
+	 * sufficiently that our assumptions about need to allocate have
+	 * changed, restart.
+	 */
+	critical_exit();
 	ZONE_LOCK(zone);
+	critical_enter();
+	KASSERT(cache == &zone->uz_cpu[PCPU_GET(cpuid)],
+	    ("uma_zalloc_arg: CPU changed!"));
+	bucket = cache->uc_allocbucket;
+	if (bucket != NULL) {
+		if (bucket != NULL && bucket->ub_cnt > 0) {
+			ZONE_UNLOCK(zone);
+			goto zalloc_start;
+		}
+		bucket = cache->uc_freebucket;
+		if (bucket != NULL && bucket->ub_cnt > 0) {
+			ZONE_UNLOCK(zone);
+			goto zalloc_start;
+		}
+	}
+
 	/* Since we have locked the zone we may as well send back our stats */
 	zone->uz_allocs += cache->uc_allocs;
 	cache->uc_allocs = 0;
@@ -1896,7 +1920,8 @@
 		goto zalloc_start;
 	}
 	/* We are no longer associated with this cpu!!! */
-	CPU_UNLOCK(cpu);
+	critical_exit();
+	sched_unpin();
 
 	/* Bump up our uz_count so we get here less */
 	if (zone->uz_count < BUCKET_MAX)
@@ -2209,6 +2234,7 @@
 	uma_bucket_t bucket;
 	int bflags;
 	int cpu;
+	int count;
 	enum zfreeskip skip;
 
 	/* This is the fast path free */
@@ -2234,12 +2260,18 @@
 		skip = SKIP_DTOR;
 	}
 
+	count = 0;
 zfree_restart:
+	sched_pin();
+	critical_enter();
 	cpu = PCPU_GET(cpuid);
-	CPU_LOCK(cpu);
 	cache = &zone->uz_cpu[cpu];
 
 zfree_start:
+	/* XXXRW: assert pinned? */
+	count++;
+	KASSERT(count < 10, ("uma_zfree_arg: count == 10"));
+	critical_assert();
 	bucket = cache->uc_freebucket;
 
 	if (bucket) {
@@ -2253,6 +2285,8 @@
 			    ("uma_zfree: Freeing to non free bucket index."));
 			bucket->ub_bucket[bucket->ub_cnt] = item;
 			bucket->ub_cnt++;
+			critical_exit();
+			sched_unpin();
 #ifdef INVARIANTS
 			ZONE_LOCK(zone);
 			if (keg->uk_flags & UMA_ZONE_MALLOC)
@@ -2261,7 +2295,6 @@
 				uma_dbg_free(zone, NULL, item);
 			ZONE_UNLOCK(zone);
 #endif
-			CPU_UNLOCK(cpu);
 			return;
 		} else if (cache->uc_allocbucket) {
 #ifdef UMA_DEBUG_ALLOC
@@ -2285,9 +2318,32 @@
 	 *
 	 * 1) The buckets are NULL
 	 * 2) The alloc and free buckets are both somewhat full.
+	 *
+	 * Since we acquire the zone lock, we must first release the
+	 * critical section, as we might otherwise sleep.  As a result, we
+	 * may be preempted, so check if that happened and restart is so.
+	 * Note that we remain pinned, so the same cache should still be
+	 * appropriate.
 	 */
+	critical_exit();
+	ZONE_LOCK(zone);
+	critical_enter();
+	KASSERT(cache == &zone->uz_cpu[PCPU_GET(cpuid)],
+	    ("uma_zalloc_arg: CPU changed!"));
 
-	ZONE_LOCK(zone);
+	if (cache->uc_freebucket != NULL) {
+		if (cache->uc_freebucket->ub_cnt <
+		    cache->uc_freebucket->ub_entries) {
+			ZONE_UNLOCK(zone);
+			goto zfree_start;
+		}
+		if (cache->uc_allocbucket != NULL &&
+		    (cache->uc_allocbucket->ub_cnt <
+		    cache->uc_freebucket->ub_cnt)) {
+			ZONE_UNLOCK(zone);
+			goto zfree_start;
+		}
+	}
 
 	bucket = cache->uc_freebucket;
 	cache->uc_freebucket = NULL;
@@ -2310,7 +2366,8 @@
 		goto zfree_start;
 	}
 	/* We're done with this CPU now */
-	CPU_UNLOCK(cpu);
+	critical_exit();
+	sched_unpin();
 
 	/* And the zone.. */
 	ZONE_UNLOCK(zone);
@@ -2724,6 +2781,7 @@
 	int cachefree;
 	uma_bucket_t bucket;
 	uma_cache_t cache;
+	u_int64_t alloc;
 
 	cnt = 0;
 	mtx_lock(&uma_mtx);
@@ -2747,6 +2805,13 @@
 	  LIST_FOREACH(z, &zk->uk_zones, uz_link) {
 		if (cnt == 0)	/* list may have changed size */
 			break;
+		/*
+		 * XXXRW: This is the last remaining use of a per-CPU mutex
+		 * to protect access to a per-CPU zone cache.  We should be
+		 * migrating CPUs here to pick up data from each cache
+		 * instead.  Because this locking no longer protects other
+		 * use of the cache, we can now get inconsistent results.
+		 */
 		if (!(zk->uk_flags & UMA_ZFLAG_INTERNAL)) {
 			for (cpu = 0; cpu <= mp_maxid; cpu++) {
 				if (CPU_ABSENT(cpu))
@@ -2756,6 +2821,7 @@
 		}
 		ZONE_LOCK(z);
 		cachefree = 0;
+		alloc = 0;
 		if (!(zk->uk_flags & UMA_ZFLAG_INTERNAL)) {
 			for (cpu = 0; cpu <= mp_maxid; cpu++) {
 				if (CPU_ABSENT(cpu))
@@ -2765,9 +2831,15 @@
 					cachefree += cache->uc_allocbucket->ub_cnt;
 				if (cache->uc_freebucket != NULL)
 					cachefree += cache->uc_freebucket->ub_cnt;
+				alloc += cache->uc_allocs;
+				cache->uc_allocs = 0;
 				CPU_UNLOCK(cpu);
 			}
 		}
+
+		/* Now push alloc stats back into zone.  */
+		z->uz_allocs += alloc;
+
 		LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link) {
 			cachefree += bucket->ub_cnt;
 		}