--- //depot/vendor/freebsd/src/sys/kern/kern_malloc.c 2005/04/12 23:55:38 +++ //depot/user/rwatson/percpu/sys/kern/kern_malloc.c 2005/04/17 18:30:41 @@ -1,4 +1,5 @@ /*- + * Copyright (c) 2005 Robert N. M. Watson * Copyright (c) 1987, 1991, 1993 * The Regents of the University of California. All rights reserved. * @@ -44,6 +45,7 @@ #include #include #include +#include #include #include @@ -133,6 +135,33 @@ {0, NULL}, }; +/* + * Two malloc type structures are present: malloc_type, which is used by a + * type owner to declare the type, and malloc_type_internal, which holds + * malloc-owned statistics and other ABI-sensitive fields, such as the set of + * malloc statistics indexed by the compile-time MAXCPU constant. + * + * The malloc_type ks_next field is protected by malloc_mtx. Other fields in + * malloc_type are static after initialization so unsynchronized. + * + * Statistics in malloc_type_stats are written only when holding a critical + * section, but read lock-free resulting in possible (minor) races, which the + * monitoring app should take into account. + */ +struct malloc_type_stats { + u_long mts_memalloced; /* Bytes allocated on CPU. */ + u_long mts_memfreed; /* Bytes freed on CPU. */ + u_long mts_numallocs; /* Number of allocates on CPU. */ + u_long mts_numfrees; /* Number of frees on CPU. */ + u_long mts_size; /* Bitmask of sizes allocated on CPU. */ +}; + +struct malloc_type_internal { + struct malloc_type_stats mti_stats[MAXCPU]; +}; + +uma_zone_t mt_zone; + #ifdef DEBUG_MEMGUARD u_int vm_memguard_divisor; SYSCTL_UINT(_vm, OID_AUTO, memguard_divisor, CTLFLAG_RD, &vm_memguard_divisor, @@ -197,41 +226,44 @@ * Add this to the informational malloc_type bucket. 
*/ static void -malloc_type_zone_allocated(struct malloc_type *ksp, unsigned long size, +malloc_type_zone_allocated(struct malloc_type *type, unsigned long size, int zindx) { - mtx_lock(&ksp->ks_mtx); - ksp->ks_calls++; + struct malloc_type_internal *mti; + struct malloc_type_stats *mts; + + critical_enter(); + mti = (struct malloc_type_internal *)(type->ks_handle); + mts = &mti->mti_stats[curcpu]; + mts->mts_memalloced += size; + mts->mts_numallocs++; if (zindx != -1) - ksp->ks_size |= 1 << zindx; - if (size != 0) { - ksp->ks_memuse += size; - ksp->ks_inuse++; - if (ksp->ks_memuse > ksp->ks_maxused) - ksp->ks_maxused = ksp->ks_memuse; - } - mtx_unlock(&ksp->ks_mtx); + mts->mts_size |= 1 << zindx; + critical_exit(); } void -malloc_type_allocated(struct malloc_type *ksp, unsigned long size) +malloc_type_allocated(struct malloc_type *type, unsigned long size) { - malloc_type_zone_allocated(ksp, size, -1); + + malloc_type_zone_allocated(type, size, -1); } /* * Remove this allocation from the informational malloc_type bucket. 
*/ void -malloc_type_freed(struct malloc_type *ksp, unsigned long size) +malloc_type_freed(struct malloc_type *type, unsigned long size) { - mtx_lock(&ksp->ks_mtx); - KASSERT(size <= ksp->ks_memuse, - ("malloc(9)/free(9) confusion.\n%s", - "Probably freeing with wrong type, but maybe not here.")); - ksp->ks_memuse -= size; - ksp->ks_inuse--; - mtx_unlock(&ksp->ks_mtx); + struct malloc_type_internal *mti; + struct malloc_type_stats *mts; + + critical_enter(); + mti = (struct malloc_type_internal *)type->ks_handle; + mts = &mti->mti_stats[curcpu]; + mts->mts_memfreed += size; + mts->mts_numfrees++; + critical_exit(); } /* @@ -351,9 +383,6 @@ } #endif - KASSERT(type->ks_memuse > 0, - ("malloc(9)/free(9) confusion.\n%s", - "Probably freeing with wrong type, but maybe not here.")); size = 0; slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK)); @@ -405,6 +434,11 @@ if (addr == NULL) return (malloc(size, type, flags)); + /* + * XXX: Should report free of old memory and alloc of new memory to + * per-CPU stats. + */ + #ifdef DEBUG_MEMGUARD /* XXX: CHANGEME! 
*/ if (type == M_SUBPROC) { @@ -543,6 +577,13 @@ uma_startup2(); + mt_zone = uma_zcreate("mt_zone", sizeof(struct malloc_type_internal), +#ifdef INVARIANTS + mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini, +#else + NULL, NULL, NULL, NULL, +#endif + UMA_ALIGN_PTR, UMA_ZONE_MALLOC); for (i = 0, indx = 0; kmemzones[indx].kz_size != 0; indx++) { int size = kmemzones[indx].kz_size; char *name = kmemzones[indx].kz_name; @@ -562,127 +603,145 @@ } void -malloc_init(void *data) +malloc_init(void *type) { - struct malloc_type *type = (struct malloc_type *)data; + struct malloc_type_internal *mti; + struct malloc_type *mt; - mtx_lock(&malloc_mtx); - if (type->ks_magic != M_MAGIC) - panic("malloc type lacks magic"); + KASSERT(cnt.v_page_count != 0, ("malloc_register before vm_init")); - if (cnt.v_page_count == 0) - panic("malloc_init not allowed before vm init"); + mt = type; + mti = uma_zalloc(mt_zone, M_WAITOK | M_ZERO); + mt->ks_handle = mti; - if (type->ks_next != NULL) - return; - - type->ks_next = kmemstatistics; + mtx_lock(&malloc_mtx); + mt->ks_next = kmemstatistics; kmemstatistics = type; - mtx_init(&type->ks_mtx, type->ks_shortdesc, "Malloc Stats", MTX_DEF); mtx_unlock(&malloc_mtx); } void -malloc_uninit(void *data) +malloc_uninit(void *type) { - struct malloc_type *type = (struct malloc_type *)data; - struct malloc_type *t; + struct malloc_type_internal *mti; + struct malloc_type *mt, *temp; + mt = type; + KASSERT(mt->ks_handle != NULL, ("malloc_deregister: cookie NULL")); mtx_lock(&malloc_mtx); - mtx_lock(&type->ks_mtx); - if (type->ks_magic != M_MAGIC) - panic("malloc type lacks magic"); - - if (cnt.v_page_count == 0) - panic("malloc_uninit not allowed before vm init"); - - if (type == kmemstatistics) - kmemstatistics = type->ks_next; - else { - for (t = kmemstatistics; t->ks_next != NULL; t = t->ks_next) { - if (t->ks_next == type) { - t->ks_next = type->ks_next; - break; - } + mti = mt->ks_handle; + mt->ks_handle = NULL; + if (mt != kmemstatistics) { + for 
(temp = kmemstatistics; temp != NULL; + temp = temp->ks_next) { + if (temp->ks_next == mt) + temp->ks_next = mt->ks_next; } - } - type->ks_next = NULL; - mtx_destroy(&type->ks_mtx); + } else + kmemstatistics = mt->ks_next; mtx_unlock(&malloc_mtx); + uma_zfree(mt_zone, mti); } static int sysctl_kern_malloc(SYSCTL_HANDLER_ARGS) { + struct malloc_type_stats *mts, mts_local; + struct malloc_type_internal *mti; + u_long temp_allocs, temp_bytes; struct malloc_type *type; int linesize = 128; - int curline; + struct sbuf sbuf; int bufsize; int first; int error; char *buf; - char *p; int cnt; - int len; int i; cnt = 0; + /* Guess at how much room is needed. */ mtx_lock(&malloc_mtx); for (type = kmemstatistics; type != NULL; type = type->ks_next) cnt++; + mtx_unlock(&malloc_mtx); - mtx_unlock(&malloc_mtx); bufsize = linesize * (cnt + 1); - p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO); + buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO); + sbuf_new(&sbuf, buf, bufsize, SBUF_FIXEDLEN); + mtx_lock(&malloc_mtx); - len = snprintf(p, linesize, + + sbuf_printf(&sbuf, "\n Type InUse MemUse HighUse Requests Size(s)\n"); - p += len; - for (type = kmemstatistics; cnt != 0 && type != NULL; type = type->ks_next, cnt--) { - if (type->ks_calls == 0) + mti = type->ks_handle; + bzero(&mts_local, sizeof(mts_local)); + for (i = 0; i < MAXCPU; i++) { + mts = &mti->mti_stats[i]; + mts_local.mts_memalloced += mts->mts_memalloced; + mts_local.mts_memfreed += mts->mts_memfreed; + mts_local.mts_numallocs += mts->mts_numallocs; + mts_local.mts_numfrees += mts->mts_numfrees; + mts_local.mts_size |= mts->mts_size; + } + if (mts_local.mts_numallocs == 0) continue; - curline = linesize - 2; /* Leave room for the \n */ - len = snprintf(p, curline, "%13s%6lu%6luK%7luK%9llu", - type->ks_shortdesc, - type->ks_inuse, - (type->ks_memuse + 1023) / 1024, - (type->ks_maxused + 1023) / 1024, - (long long unsigned)type->ks_calls); - curline -= len; - p += len; + /* + * Due to races in per-CPU 
statistics gather, it's possible to + * get a slightly negative number here. If we do, approximate + * with 0. + */ + if (mts_local.mts_numallocs > mts_local.mts_numfrees) + temp_allocs = mts_local.mts_numallocs - + mts_local.mts_numfrees; + else + temp_allocs = 0; + + /* + * Ditto for bytes allocated. + */ + if (mts_local.mts_memalloced > mts_local.mts_memfreed) + temp_bytes = mts_local.mts_memalloced - + mts_local.mts_memfreed; + else + temp_bytes = 0; + + /* + * XXXRW: High-watermark is no longer easily available, so + * we just print '-' for that column. + */ + sbuf_printf(&sbuf, "%13s%6lu%6luK -%9lu", + type->ks_shortdesc, + temp_allocs, + (temp_bytes + 1023) / 1024, + mts_local.mts_numallocs); first = 1; for (i = 0; i < sizeof(kmemzones) / sizeof(kmemzones[0]) - 1; i++) { - if (type->ks_size & (1 << i)) { + if (mts_local.mts_size & (1 << i)) { if (first) - len = snprintf(p, curline, " "); + sbuf_printf(&sbuf, " "); else - len = snprintf(p, curline, ","); - curline -= len; - p += len; - - len = snprintf(p, curline, - "%s", kmemzones[i].kz_name); - curline -= len; - p += len; - + sbuf_printf(&sbuf, ","); + sbuf_printf(&sbuf, "%s", + kmemzones[i].kz_name); first = 0; } } - - len = snprintf(p, 2, "\n"); - p += len; + sbuf_printf(&sbuf, "\n"); } + sbuf_finish(&sbuf); + mtx_unlock(&malloc_mtx); - mtx_unlock(&malloc_mtx); - error = SYSCTL_OUT(req, buf, p - buf); + error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf)); + sbuf_delete(&sbuf); free(buf, M_TEMP); return (error); } @@ -696,6 +755,7 @@ sysctl_kern_mprof(SYSCTL_HANDLER_ARGS) { int linesize = 64; + struct sbuf sbuf; uint64_t count; uint64_t waste; uint64_t mem; @@ -704,7 +764,5 @@ char *buf; int rsize; int size; - char *p; - int len; int i; @@ -714,34 +772,30 @@ waste = 0; mem = 0; - p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO); - len = snprintf(p, bufsize, + buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO); + sbuf_new(&sbuf, buf, bufsize, SBUF_FIXEDLEN); + sbuf_printf(&sbuf, "\n Size 
Requests Real Size\n"); - bufsize -= len; - p += len; - for (i = 0; i < KMEM_ZSIZE; i++) { size = i << KMEM_ZSHIFT; rsize = kmemzones[kmemsize[i]].kz_size; count = (long long unsigned)krequests[i]; - len = snprintf(p, bufsize, "%6d%28llu%11d\n", - size, (unsigned long long)count, rsize); - bufsize -= len; - p += len; + sbuf_printf(&sbuf, "%6d%28llu%11d\n", size, + (unsigned long long)count, rsize); if ((rsize * count) > (size * count)) waste += (rsize * count) - (size * count); mem += (rsize * count); } - - len = snprintf(p, bufsize, + sbuf_printf(&sbuf, "\nTotal memory used:\t%30llu\nTotal Memory wasted:\t%30llu\n", (unsigned long long)mem, (unsigned long long)waste); - p += len; + sbuf_finish(&sbuf); - error = SYSCTL_OUT(req, buf, p - buf); + error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf)); + sbuf_delete(&sbuf); free(buf, M_TEMP); return (error); } --- //depot/vendor/freebsd/src/sys/kern/kern_mbuf.c 2005/02/16 21:50:29 +++ //depot/user/rwatson/percpu/sys/kern/kern_mbuf.c 2005/04/15 11:11:26 @@ -1,6 +1,7 @@ /*- - * Copyright (c) 2004, 2005, - * Bosko Milekic . All rights reserved. + * Copyright (c) 2004, 2005 Bosko Milekic + * Copyright (c) 2005 Robert N. M. Watson + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -31,6 +32,9 @@ #include "opt_mac.h" #include "opt_param.h" +/* Need mbstat_percpu definition from mbuf.h. */ +#define WANT_MBSTAT_PERCPU + #include #include #include @@ -39,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -79,7 +84,18 @@ */ int nmbclusters; + +/* + * mbstat is the mbuf statistics structure exposed to userspace. + * + * mbstat_percpu is the per-CPU statistics structure in which many of the + * mbstat measurements are gathered before being combined for exposure to + * userspace. mbstat_percpu is read lockless, so subject to small + * consistency races. 
It is modified holding a critical section to avoid + * read-modify-write races in the presence of preemption. + */ struct mbstat mbstat; +struct mbstat_percpu mbstat_percpu[MAXCPU]; static void tunable_mbinit(void *dummy) @@ -91,11 +107,13 @@ } SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL); +static int sysctl_kern_ipc_mbstat(SYSCTL_HANDLER_ARGS); + SYSCTL_DECL(_kern_ipc); SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RW, &nmbclusters, 0, "Maximum number of mbuf clusters allowed"); -SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat, - "Mbuf general information and statistics"); +SYSCTL_PROC(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, NULL, 0, + sysctl_kern_ipc_mbstat, "", "Mbuf general information and statistics"); /* * Zones from which we allocate. @@ -170,8 +188,69 @@ mbstat.m_mcfail = mbstat.m_mpfail = 0; mbstat.sf_iocnt = 0; mbstat.sf_allocwait = mbstat.sf_allocfail = 0; + + /* mbstat_percpu is zero'd by BSS. */ } +static int +sysctl_kern_ipc_mbstat(SYSCTL_HANDLER_ARGS) +{ + struct mbstat_percpu *mbp, mbp_local; + u_int cpu; + + bzero(&mbp_local, sizeof(mbp_local)); + for (cpu = 0; cpu < MAXCPU; cpu++) { + mbp = &mbstat_percpu[cpu]; + mbp_local.mbp_mbuf_allocs += mbp->mbp_mbuf_allocs; + mbp_local.mbp_mbuf_frees += mbp->mbp_mbuf_frees; + mbp_local.mbp_mbuf_fails += mbp->mbp_mbuf_fails; + mbp_local.mbp_mbuf_drains += mbp->mbp_mbuf_drains; + mbp_local.mbp_clust_allocs += mbp->mbp_clust_allocs; + mbp_local.mbp_clust_frees += mbp->mbp_clust_frees; + + mbp_local.mbp_copy_fails += mbp->mbp_copy_fails; + mbp_local.mbp_pullup_fails += mbp->mbp_pullup_fails; + + mbp_local.sfp_iocnt += mbp->sfp_iocnt; + mbp_local.sfp_alloc_fails += mbp->sfp_alloc_fails; + mbp_local.sfp_alloc_waits += mbp->sfp_alloc_waits; + } + + /* + * If, due to races, the number of frees for mbufs or clusters is + * greater than the number of allocs, adjust alloc stats to 0. 
This + * isn't quite accurate, but for the time being, we consider the + * performance win of races worth the occasional inaccuracy. + */ + if (mbp_local.mbp_mbuf_allocs > mbp_local.mbp_mbuf_frees) + mbstat.m_mbufs = mbp_local.mbp_mbuf_allocs - + mbp_local.mbp_mbuf_frees; + else + mbstat.m_mbufs = 0; + + if (mbp_local.mbp_clust_allocs > mbp_local.mbp_clust_frees) + mbstat.m_mclusts = mbp_local.mbp_clust_allocs - + mbp_local.mbp_clust_frees; + else + mbstat.m_mclusts = 0; + + mbstat.m_drain = mbp_local.mbp_mbuf_drains; + mbstat.m_mcfail = mbp_local.mbp_copy_fails; + mbstat.m_mpfail = mbp_local.mbp_pullup_fails; + + mbstat.sf_iocnt = mbp_local.sfp_iocnt; + mbstat.sf_allocfail = mbp_local.sfp_alloc_fails; + /* + * sf_allocwait is protected by per-architecture mutex sf_buf_lock, + * which is held whenever sf_allocwait is updated, so don't use the + * per-cpu version here + * + * mbstat.sf_allocwait = mbp_local.sfp_alloc_waits; + */ + + return (SYSCTL_OUT(req, &mbstat, sizeof(mbstat))); +} + /* * Constructor for Mbuf master zone. 
* @@ -212,7 +291,10 @@ #endif } else m->m_data = m->m_dat; - mbstat.m_mbufs += 1; /* XXX */ + + critical_enter(); + mbstat_percpu[curcpu].mbp_mbuf_allocs++; + critical_exit(); return (0); } @@ -227,7 +309,9 @@ m = (struct mbuf *)mem; if ((m->m_flags & M_PKTHDR) != 0) m_tag_delete_chain(m, NULL); - mbstat.m_mbufs -= 1; /* XXX */ + critical_enter(); + mbstat_percpu[curcpu].mbp_mbuf_frees++; + critical_exit(); } /* XXX Only because of stats */ @@ -235,12 +319,16 @@ mb_dtor_pack(void *mem, int size, void *arg) { struct mbuf *m; + u_char cpu; m = (struct mbuf *)mem; if ((m->m_flags & M_PKTHDR) != 0) m_tag_delete_chain(m, NULL); - mbstat.m_mbufs -= 1; /* XXX */ - mbstat.m_mclusts -= 1; /* XXX */ + critical_enter(); + cpu = curcpu; + mbstat_percpu[cpu].mbp_mbuf_frees++; + mbstat_percpu[cpu].mbp_clust_frees++; + critical_exit(); } /* @@ -263,7 +351,9 @@ m->m_ext.ext_size = MCLBYTES; m->m_ext.ext_type = EXT_CLUSTER; m->m_ext.ref_cnt = NULL; /* Lazy counter assign. */ - mbstat.m_mclusts += 1; /* XXX */ + critical_enter(); + mbstat_percpu[curcpu].mbp_clust_allocs++; + critical_exit(); return (0); } @@ -271,7 +361,10 @@ static void mb_dtor_clust(void *mem, int size, void *arg) { - mbstat.m_mclusts -= 1; /* XXX */ + + critical_enter(); + mbstat_percpu[curcpu].mbp_clust_frees++; + critical_exit(); } /* @@ -288,7 +381,9 @@ uma_zalloc_arg(zone_clust, m, how); if (m->m_ext.ext_buf == NULL) return (ENOMEM); - mbstat.m_mclusts -= 1; /* XXX */ + critical_enter(); + mbstat_percpu[curcpu].mbp_clust_frees++; + critical_exit(); return (0); } @@ -304,7 +399,9 @@ m = (struct mbuf *)mem; uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL); m->m_ext.ext_buf = NULL; - mbstat.m_mclusts += 1; /* XXX */ + critical_enter(); + mbstat_percpu[curcpu].mbp_clust_allocs++; + critical_exit(); } /* @@ -320,6 +417,7 @@ #endif int flags; short type; + u_char cpu; m = (struct mbuf *)mem; args = (struct mb_args *)arg; @@ -348,8 +446,11 @@ return (error); #endif } - mbstat.m_mbufs += 1; /* XXX */ - 
mbstat.m_mclusts += 1; /* XXX */ + critical_enter(); + cpu = curcpu; + mbstat_percpu[cpu].mbp_mbuf_allocs++; + mbstat_percpu[cpu].mbp_clust_allocs++; + critical_exit(); return (0); } @@ -369,7 +470,9 @@ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, "mb_reclaim()"); - mbstat.m_drain++; + critical_enter(); + mbstat_percpu[curcpu].mbp_mbuf_drains++; + critical_exit(); for (dp = domains; dp != NULL; dp = dp->dom_next) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_drain != NULL) --- //depot/vendor/freebsd/src/sys/kern/uipc_mbuf.c 2005/03/17 19:35:19 +++ //depot/user/rwatson/percpu/sys/kern/uipc_mbuf.c 2005/04/15 10:55:44 @@ -36,6 +36,9 @@ #include "opt_param.h" #include "opt_mbuf_stress_test.h" +/* Need mbstat_percpu definition from mbuf.h. */ +#define WANT_MBSTAT_PERCPU + #include #include #include @@ -44,8 +47,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -428,13 +433,18 @@ m = m->m_next; np = &n->m_next; } - if (top == NULL) - mbstat.m_mcfail++; /* XXX: No consistency. */ + if (top == NULL) { + critical_enter(); + mbstat_percpu[curcpu].mbp_copy_fails++; + critical_exit(); + } return (top); nospace: m_freem(top); - mbstat.m_mcfail++; /* XXX: No consistency. */ + critical_enter(); + mbstat_percpu[curcpu].mbp_copy_fails++; + critical_exit(); return (NULL); } @@ -497,7 +507,9 @@ return top; nospace: m_freem(top); - mbstat.m_mcfail++; /* XXX: No consistency. */ + critical_enter(); + mbstat_percpu[curcpu].mbp_copy_fails++; + critical_exit(); return (NULL); } @@ -600,7 +612,9 @@ nospace: m_freem(top); - mbstat.m_mcfail++; /* XXX: No consistency. */ + critical_enter(); + mbstat_percpu[curcpu].mbp_copy_fails++; + critical_exit(); return (NULL); } @@ -762,7 +776,9 @@ return (m); bad: m_freem(n); - mbstat.m_mpfail++; /* XXX: No consistency. 
*/ + critical_enter(); + mbstat_percpu[curcpu].mbp_pullup_fails++; + critical_exit(); return (NULL); } --- //depot/vendor/freebsd/src/sys/kern/uipc_syscalls.c 2005/04/16 18:50:30 +++ //depot/user/rwatson/percpu/sys/kern/uipc_syscalls.c 2005/04/25 10:22:44 @@ -39,6 +39,9 @@ #include "opt_ktrace.h" #include "opt_mac.h" +/* Need mbstat_percpu definition from mbuf.h. */ +#define WANT_MBSTAT_PERCPU + #include #include #include @@ -1933,7 +1936,9 @@ vm_page_io_finish(pg); if (!error) VM_OBJECT_UNLOCK(obj); - mbstat.sf_iocnt++; + critical_enter(); + mbstat_percpu[curcpu].sfp_iocnt++; + critical_exit(); } if (error) { @@ -1961,7 +1966,9 @@ * but this wait can be interrupted. */ if ((sf = sf_buf_alloc(pg, SFB_CATCH)) == NULL) { - mbstat.sf_allocfail++; + critical_enter(); + mbstat_percpu[curcpu].sfp_alloc_fails++; + critical_exit(); vm_page_lock_queues(); vm_page_unwire(pg, 0); if (pg->wire_count == 0 && pg->object == NULL) --- //depot/vendor/freebsd/src/sys/sys/malloc.h 2005/01/07 02:32:16 +++ //depot/user/rwatson/percpu/sys/sys/malloc.h 2005/04/14 12:54:00 @@ -50,25 +50,52 @@ #define M_MAGIC 877983977 /* time when first defined :-) */ +/* + * ABI-compatible version of the old 'struct malloc_type', only all stats are + * now malloc-managed in malloc-owned memory rather than in caller memory, so + * as to avoid ABI issues. The old mutex's lo_class slot is reused as the + * internal data handle (ks_handle). + * + * XXXRW: Why is this not ifdef _KERNEL? + * + * XXXRW: Use of ks_shortdesc has leaked out of kern_malloc.c. 
+ */ struct malloc_type { - struct malloc_type *ks_next; /* next in list */ - u_long ks_memuse; /* total memory held in bytes */ - u_long ks_size; /* sizes of this thing that are allocated */ - u_long ks_inuse; /* # of packets of this type currently in use */ - uint64_t ks_calls; /* total packets of this type ever allocated */ - u_long ks_maxused; /* maximum number ever used */ - u_long ks_magic; /* if it's not magic, don't touch it */ - const char *ks_shortdesc; /* short description */ - struct mtx ks_mtx; /* lock for stats */ + struct malloc_type *ks_next; /* Next in global chain. */ + u_long _ks_memuse; /* No longer used. */ + u_long _ks_size; /* No longer used. */ + u_long _ks_inuse; /* No longer used. */ + uint64_t _ks_calls; /* No longer used. */ + u_long _ks_maxused; /* No longer used. */ + u_long ks_magic; /* Detect programmer error. */ + const char *ks_shortdesc; /* Printable type name. */ + + /* + * struct malloc_type was terminated with a struct mtx, which is no + * longer required. For ABI reasons, continue to flesh out the full + * size of the old structure, but reuse the _lo_class field for our + * internal data handle. + */ + void *ks_handle; /* Priv. data, was lo_class. 
*/ + const char *_lo_name; + const char *_lo_type; + u_int _lo_flags; + void *_lo_list_next; + struct witness *_lo_witness; + uintptr_t _mtx_lock; + u_int _mtx_recurse; }; #ifdef _KERNEL -#define MALLOC_DEFINE(type, shortdesc, longdesc) \ - struct malloc_type type[1] = { \ - { NULL, 0, 0, 0, 0, 0, M_MAGIC, shortdesc, {} } \ - }; \ - SYSINIT(type##_init, SI_SUB_KMEM, SI_ORDER_SECOND, malloc_init, type); \ - SYSUNINIT(type##_uninit, SI_SUB_KMEM, SI_ORDER_ANY, malloc_uninit, type) +#define MALLOC_DEFINE(type, shortdesc, longdesc) \ + struct malloc_type type[1] = { \ + { NULL, 0, 0, 0, 0, 0, M_MAGIC, shortdesc, NULL, NULL, \ + NULL, 0, NULL, NULL, 0, 0 } \ + }; \ + SYSINIT(type##_init, SI_SUB_KMEM, SI_ORDER_SECOND, malloc_init, \ + type); \ + SYSUNINIT(type##_uninit, SI_SUB_KMEM, SI_ORDER_ANY, \ + malloc_uninit, type) #define MALLOC_DECLARE(type) \ extern struct malloc_type type[1] @@ -112,6 +139,7 @@ int flags); void *reallocf(void *addr, unsigned long size, struct malloc_type *type, int flags); + #endif /* _KERNEL */ #endif /* !_SYS_MALLOC_H_ */ --- //depot/vendor/freebsd/src/sys/sys/mbuf.h 2005/03/17 19:35:19 +++ //depot/user/rwatson/percpu/sys/sys/mbuf.h 2005/04/15 10:55:44 @@ -243,6 +243,29 @@ #define MT_NTYPES 16 /* number of mbuf types for mbtypes[] */ /* + * Per-CPU mbuf allocator statistics, which are collated to construct the + * global statistics. They are read lockless, but written to while in a + * critical section to prevent read-modify-write races. + * + * XXXRW: As with comments below, maybe sendfile stats should be elsewhere. + */ +struct mbstat_percpu { + u_long mbp_mbuf_allocs; /* mbufs alloc'd on CPU. */ + u_long mbp_mbuf_frees; /* mbufs freed on CPU. */ + u_long mbp_mbuf_fails; /* mbuf alloc failures on CPU. */ + u_long mbp_mbuf_drains; /* mbuf drains on CPU. */ + u_long mbp_clust_allocs; /* clusters alloc'd on CPU. */ + u_long mbp_clust_frees; /* clusters freed on CPU. */ + + u_long mbp_copy_fails; /* mbuf copy failures on CPU. 
*/ + u_long mbp_pullup_fails; /* mbuf pullup failures on CPU. */ + + u_long sfp_iocnt; /* sendfile I/O's on CPU. */ + u_long sfp_alloc_fails; /* sendfile alloc failures on CPU. */ + u_long sfp_alloc_waits; /* sendfile alloc waits on CPU. */ +}; + +/* * General mbuf allocator statistics structure. */ struct mbstat { @@ -550,6 +573,15 @@ extern struct mbstat mbstat; /* General mbuf stats/infos */ extern int nmbclusters; /* Maximum number of clusters */ +/* + * Avoid exposing PERCPU definition outside of a very limited set of files, + * so that the compile-time value of PERCPU doesn't become part of the + * exposed kernel ABI. + */ +#ifdef WANT_MBSTAT_PERCPU +extern struct mbstat_percpu mbstat_percpu[MAXCPU]; +#endif + struct uio; void m_adj(struct mbuf *, int); --- //depot/vendor/freebsd/src/sys/sys/pcpu.h 2005/01/07 02:32:16 +++ //depot/user/rwatson/percpu/sys/sys/pcpu.h 2005/04/17 14:42:36 @@ -81,6 +81,7 @@ extern struct cpuhead cpuhead; #define CURPROC (curthread->td_proc) +#define curcpu PCPU_GET(cpuid) #define curkse (curthread->td_kse) #define curksegrp (curthread->td_ksegrp) #define curproc (curthread->td_proc) --- //depot/vendor/freebsd/src/sys/vm/uma_core.c 2005/02/24 06:30:36 +++ //depot/user/rwatson/percpu/sys/vm/uma_core.c 2005/04/18 12:47:40 @@ -1,4 +1,5 @@ /*- + * Copyright (c) 2004-2005 Robert N. M. Watson * Copyright (c) 2004, 2005, * Bosko Milekic . All rights reserved. * Copyright (c) 2002, 2003, 2004, 2005, @@ -119,9 +120,6 @@ /* This mutex protects the keg list */ static struct mtx uma_mtx; -/* These are the pcpu cache locks */ -static struct mtx uma_pcpu_mtx[MAXCPU]; - /* Linked list of boot time pages */ static LIST_HEAD(,uma_slab) uma_boot_pages = LIST_HEAD_INITIALIZER(&uma_boot_pages); @@ -384,48 +382,19 @@ zone_timeout(uma_zone_t zone) { uma_keg_t keg; - uma_cache_t cache; u_int64_t alloc; - int cpu; keg = zone->uz_keg; alloc = 0; /* - * Aggregate per cpu cache statistics back to the zone. 
- * - * XXX This should be done in the sysctl handler. - * - * I may rewrite this to set a flag in the per cpu cache instead of - * locking. If the flag is not cleared on the next round I will have - * to lock and do it here instead so that the statistics don't get too - * far out of sync. - */ - if (!(keg->uk_flags & UMA_ZFLAG_INTERNAL)) { - for (cpu = 0; cpu <= mp_maxid; cpu++) { - if (CPU_ABSENT(cpu)) - continue; - CPU_LOCK(cpu); - cache = &zone->uz_cpu[cpu]; - /* Add them up, and reset */ - alloc += cache->uc_allocs; - cache->uc_allocs = 0; - CPU_UNLOCK(cpu); - } - } - - /* Now push these stats back into the zone.. */ - ZONE_LOCK(zone); - zone->uz_allocs += alloc; - - /* * Expand the zone hash table. * * This is done if the number of slabs is larger than the hash size. * What I'm trying to do here is completely reduce collisions. This * may be a little aggressive. Should I allow for two collisions max? */ - + ZONE_LOCK(zone); if (keg->uk_flags & UMA_ZONE_HASH && keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) { struct uma_hash newhash; @@ -613,6 +582,10 @@ /* * Drains the per cpu caches for a zone. * + * NOTE: This may only be called while the zone is being turn down, and not + * during normal operation. This is necessary in order that we do not have + * to migrate CPUs to drain the per-CPU caches. + * * Arguments: * zone The zone to drain, must be unlocked. * @@ -626,12 +599,20 @@ int cpu; /* - * We have to lock each cpu cache before locking the zone + * XXX: It is safe to not lock the per-CPU caches, because we're + * tearing down the zone anyway. I.e., there will be no further use + * of the caches at this point. + * + * XXX: It would good to be able to assert that the zone is being + * torn down to prevent improper use of cache_drain(). + * + * XXX: We lock the zone before passing into bucket_cache_drain() as + * it is used elsewhere. Should the tear-down path be made special + * there in some form? 
*/ for (cpu = 0; cpu <= mp_maxid; cpu++) { if (CPU_ABSENT(cpu)) continue; - CPU_LOCK(cpu); cache = &zone->uz_cpu[cpu]; bucket_drain(zone, cache->uc_allocbucket); bucket_drain(zone, cache->uc_freebucket); @@ -644,11 +625,6 @@ ZONE_LOCK(zone); bucket_cache_drain(zone); ZONE_UNLOCK(zone); - for (cpu = 0; cpu <= mp_maxid; cpu++) { - if (CPU_ABSENT(cpu)) - continue; - CPU_UNLOCK(cpu); - } } /* @@ -828,7 +804,8 @@ &flags, wait); if (mem == NULL) { if (keg->uk_flags & UMA_ZONE_OFFPAGE) - uma_zfree_internal(keg->uk_slabzone, slab, NULL, 0); + uma_zfree_internal(keg->uk_slabzone, slab, NULL, + SKIP_NONE); ZONE_LOCK(zone); return (NULL); } @@ -1643,10 +1620,6 @@ #ifdef UMA_DEBUG printf("Initializing pcpu cache locks.\n"); #endif - /* Initialize the pcpu cache lock set once and for all */ - for (i = 0; i <= mp_maxid; i++) - CPU_LOCK_INIT(i); - #ifdef UMA_DEBUG printf("Creating slab and hash zones.\n"); #endif @@ -1793,6 +1766,9 @@ uma_cache_t cache; uma_bucket_t bucket; int cpu; +#ifdef INVARIANTS + int count; +#endif int badness; /* This is the fast path allocation */ @@ -1827,12 +1803,33 @@ } } + /* + * If possible, allocate from the per-CPU cache. There are two + * requirements for safe access to the per-CPU cache: (1) the thread + * accessing the cache must not be preempted or yield during access, + * and (2) the thread must not migrate CPUs without switching which + * cache it accesses. We rely on a critical section to prevent + * preemption and migration. We release the critical section in + * order to acquire the zone mutex if we are unable to allocate from + * the current cache; when we re-acquire the critical section, we + * must detect and handle migration if it has occurred. 
+ */ +#ifdef INVARIANTS + count = 0; +#endif zalloc_restart: - cpu = PCPU_GET(cpuid); - CPU_LOCK(cpu); + critical_enter(); + cpu = curcpu; cache = &zone->uz_cpu[cpu]; zalloc_start: +#ifdef INVARIANTS + count++; + KASSERT(count < 10, ("uma_zalloc_arg: count == 10")); +#endif +#if 0 + critical_assert(); +#endif bucket = cache->uc_allocbucket; if (bucket) { @@ -1845,12 +1842,12 @@ KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled.")); cache->uc_allocs++; + critical_exit(); #ifdef INVARIANTS ZONE_LOCK(zone); uma_dbg_alloc(zone, NULL, item); ZONE_UNLOCK(zone); #endif - CPU_UNLOCK(cpu); if (zone->uz_ctor != NULL) { if (zone->uz_ctor(item, zone->uz_keg->uk_size, udata, flags) != 0) { @@ -1880,7 +1877,33 @@ } } } + /* + * Attempt to retrieve the item from the per-CPU cache has failed, so + * we must go back to the zone. This requires the zone lock, so we + * must drop the critical section, then re-acquire it when we go back + * to the cache. Since the critical section is released, we may be + * preempted or migrate. As such, make sure not to maintain any + * thread-local state specific to the cache from prior to releasing + * the critical section. + */ + critical_exit(); ZONE_LOCK(zone); + critical_enter(); + cpu = curcpu; + cache = &zone->uz_cpu[cpu]; + bucket = cache->uc_allocbucket; + if (bucket != NULL) { + if (bucket->ub_cnt > 0) { + ZONE_UNLOCK(zone); + goto zalloc_start; + } + bucket = cache->uc_freebucket; + if (bucket != NULL && bucket->ub_cnt > 0) { + ZONE_UNLOCK(zone); + goto zalloc_start; + } + } + /* Since we have locked the zone we may as well send back our stats */ zone->uz_allocs += cache->uc_allocs; cache->uc_allocs = 0; @@ -1904,8 +1927,8 @@ ZONE_UNLOCK(zone); goto zalloc_start; } - /* We are no longer associated with this cpu!!! */ - CPU_UNLOCK(cpu); + /* We are no longer associated with this CPU. 
*/ + critical_exit(); /* Bump up our uz_count so we get here less */ if (zone->uz_count < BUCKET_MAX) @@ -2228,10 +2251,10 @@ uma_bucket_t bucket; int bflags; int cpu; - enum zfreeskip skip; +#ifdef INVARIANTS + int count; +#endif - /* This is the fast path free */ - skip = SKIP_NONE; keg = zone->uz_keg; #ifdef UMA_DEBUG_ALLOC_1 @@ -2240,25 +2263,50 @@ CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread, zone->uz_name); + if (zone->uz_dtor) + zone->uz_dtor(item, keg->uk_size, udata); +#ifdef INVARIANTS + ZONE_LOCK(zone); + if (keg->uk_flags & UMA_ZONE_MALLOC) + uma_dbg_free(zone, udata, item); + else + uma_dbg_free(zone, NULL, item); + ZONE_UNLOCK(zone); +#endif /* * The race here is acceptable. If we miss it we'll just have to wait * a little longer for the limits to be reset. */ - if (keg->uk_flags & UMA_ZFLAG_FULL) goto zfree_internal; - if (zone->uz_dtor) { - zone->uz_dtor(item, keg->uk_size, udata); - skip = SKIP_DTOR; - } - +#ifdef INVARIANTS + count = 0; +#endif + /* + * If possible, free to the per-CPU cache. There are two + * requirements for safe access to the per-CPU cache: (1) the thread + * accessing the cache must not be preempted or yield during access, + * and (2) the thread must not migrate CPUs without switching which + * cache it accesses. We rely on a critical section to prevent + * preemption and migration. We release the critical section in + * order to acquire the zone mutex if we are unable to free to the + * current cache; when we re-acquire the critical section, we must + * detect and handle migration if it has occurred. 
+ */ zfree_restart: - cpu = PCPU_GET(cpuid); - CPU_LOCK(cpu); + critical_enter(); + cpu = curcpu; cache = &zone->uz_cpu[cpu]; zfree_start: +#ifdef INVARIANTS + count++; + KASSERT(count < 10, ("uma_zfree_arg: count == 10")); +#endif +#if 0 + critical_assert(); +#endif bucket = cache->uc_freebucket; if (bucket) { @@ -2272,15 +2320,7 @@ ("uma_zfree: Freeing to non free bucket index.")); bucket->ub_bucket[bucket->ub_cnt] = item; bucket->ub_cnt++; -#ifdef INVARIANTS - ZONE_LOCK(zone); - if (keg->uk_flags & UMA_ZONE_MALLOC) - uma_dbg_free(zone, udata, item); - else - uma_dbg_free(zone, NULL, item); - ZONE_UNLOCK(zone); -#endif - CPU_UNLOCK(cpu); + critical_exit(); return; } else if (cache->uc_allocbucket) { #ifdef UMA_DEBUG_ALLOC @@ -2304,9 +2344,32 @@ * * 1) The buckets are NULL * 2) The alloc and free buckets are both somewhat full. + * + * We must go back to the zone, which requires acquiring the zone lock, + * which in turn means we must release and re-acquire the critical + * section. Since the critical section is released, we may be + * preempted or migrate. As such, make sure not to maintain any + * thread-local state specific to the cache from prior to releasing + * the critical section. */ - + critical_exit(); ZONE_LOCK(zone); + critical_enter(); + cpu = curcpu; + cache = &zone->uz_cpu[cpu]; + if (cache->uc_freebucket != NULL) { + if (cache->uc_freebucket->ub_cnt < + cache->uc_freebucket->ub_entries) { + ZONE_UNLOCK(zone); + goto zfree_start; + } + if (cache->uc_allocbucket != NULL && + (cache->uc_allocbucket->ub_cnt < + cache->uc_freebucket->ub_cnt)) { + ZONE_UNLOCK(zone); + goto zfree_start; + } + } bucket = cache->uc_freebucket; cache->uc_freebucket = NULL; @@ -2328,8 +2391,8 @@ cache->uc_freebucket = bucket; goto zfree_start; } - /* We're done with this CPU now */ - CPU_UNLOCK(cpu); + /* We are no longer associated with this CPU. */ + critical_exit(); /* And the zone..
*/ ZONE_UNLOCK(zone); @@ -2353,26 +2416,8 @@ /* * If nothing else caught this, we'll just do an internal free. */ - zfree_internal: - -#ifdef INVARIANTS - /* - * If we need to skip the dtor and the uma_dbg_free in - * uma_zfree_internal because we've already called the dtor - * above, but we ended up here, then we need to make sure - * that we take care of the uma_dbg_free immediately. - */ - if (skip) { - ZONE_LOCK(zone); - if (keg->uk_flags & UMA_ZONE_MALLOC) - uma_dbg_free(zone, udata, item); - else - uma_dbg_free(zone, NULL, item); - ZONE_UNLOCK(zone); - } -#endif - uma_zfree_internal(zone, item, udata, skip); + uma_zfree_internal(zone, item, udata, SKIP_DTOR); return; } @@ -2655,7 +2700,7 @@ slab->us_flags = flags | UMA_SLAB_MALLOC; slab->us_size = size; } else { - uma_zfree_internal(slabzone, slab, NULL, 0); + uma_zfree_internal(slabzone, slab, NULL, SKIP_NONE); } return (mem); @@ -2666,7 +2711,7 @@ { vsetobj((vm_offset_t)slab->us_data, kmem_object); page_free(slab->us_data, slab->us_size, slab->us_flags); - uma_zfree_internal(slabzone, slab, NULL, 0); + uma_zfree_internal(slabzone, slab, NULL, SKIP_NONE); } void @@ -2743,6 +2788,7 @@ int cachefree; uma_bucket_t bucket; uma_cache_t cache; + u_int64_t alloc; cnt = 0; mtx_lock(&uma_mtx); @@ -2766,15 +2812,9 @@ LIST_FOREACH(z, &zk->uk_zones, uz_link) { if (cnt == 0) /* list may have changed size */ break; - if (!(zk->uk_flags & UMA_ZFLAG_INTERNAL)) { - for (cpu = 0; cpu <= mp_maxid; cpu++) { - if (CPU_ABSENT(cpu)) - continue; - CPU_LOCK(cpu); - } - } ZONE_LOCK(z); cachefree = 0; + alloc = 0; if (!(zk->uk_flags & UMA_ZFLAG_INTERNAL)) { for (cpu = 0; cpu <= mp_maxid; cpu++) { if (CPU_ABSENT(cpu)) @@ -2784,9 +2824,12 @@ cachefree += cache->uc_allocbucket->ub_cnt; if (cache->uc_freebucket != NULL) cachefree += cache->uc_freebucket->ub_cnt; - CPU_UNLOCK(cpu); + alloc += cache->uc_allocs; + cache->uc_allocs = 0; } } + alloc += z->uz_allocs; + LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link) { cachefree += 
bucket->ub_cnt; } @@ -2797,7 +2840,7 @@ zk->uk_maxpages * zk->uk_ipers, (zk->uk_ipers * (zk->uk_pages / zk->uk_ppera)) - totalfree, totalfree, - (unsigned long long)z->uz_allocs); + (unsigned long long)alloc); ZONE_UNLOCK(z); for (p = offset + 12; p > offset && *p == ' '; --p) /* nothing */ ; --- //depot/vendor/freebsd/src/sys/vm/uma_int.h 2005/02/16 21:50:29 +++ //depot/user/rwatson/percpu/sys/vm/uma_int.h 2005/03/15 19:57:24 @@ -342,16 +342,6 @@ #define ZONE_LOCK(z) mtx_lock((z)->uz_lock) #define ZONE_UNLOCK(z) mtx_unlock((z)->uz_lock) -#define CPU_LOCK_INIT(cpu) \ - mtx_init(&uma_pcpu_mtx[(cpu)], "UMA pcpu", "UMA pcpu", \ - MTX_DEF | MTX_DUPOK) - -#define CPU_LOCK(cpu) \ - mtx_lock(&uma_pcpu_mtx[(cpu)]) - -#define CPU_UNLOCK(cpu) \ - mtx_unlock(&uma_pcpu_mtx[(cpu)]) - /* * Find a slab within a hash table. This is used for OFFPAGE zones to lookup * the slab structure.