=================================================================== RCS file: /home/ncvs/src/sys/conf/files,v retrieving revision 1.977 diff -u -r1.977 files --- conf/files 30 Dec 2004 23:19:40 -0000 1.977 +++ conf/files 31 Dec 2004 15:32:37 -0000 @@ -1120,6 +1121,7 @@ kern/kern_timeout.c standard kern/kern_umtx.c standard kern/kern_uuid.c standard +kern/kern_waitstat.c standard kern/kern_xxx.c standard kern/link_elf.c standard kern/md4c.c optional netsmb Index: kern/kern_exec.c =================================================================== RCS file: /home/ncvs/src/sys/kern/kern_exec.c,v retrieving revision 1.260 diff -u -r1.260 kern_exec.c --- kern/kern_exec.c 14 Dec 2004 07:20:03 -0000 1.260 +++ kern/kern_exec.c 31 Dec 2004 17:13:07 -0000 @@ -58,6 +58,7 @@ #include #include #include +#include #ifdef KTRACE #include #endif @@ -505,6 +513,9 @@ /* reset caught signals */ execsigs(p); + /* clear any wait stat profiling */ + (void)waitstat_deregister(p); + /* name this process - nameiexec(p, ndp) */ len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN); bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len); Index: kern/kern_exit.c =================================================================== RCS file: /home/ncvs/src/sys/kern/kern_exit.c,v retrieving revision 1.252 diff -u -r1.252 kern_exit.c --- kern/kern_exit.c 27 Nov 2004 06:51:37 -0000 1.252 +++ kern/kern_exit.c 31 Dec 2004 17:13:24 -0000 @@ -68,6 +68,7 @@ #ifdef KTRACE #include #endif +#include #include #include @@ -188,6 +189,9 @@ PROC_LOCK(p); _STOPEVENT(p, S_EXIT, rv); wakeup(&p->p_stype); /* Wakeup anyone in procfs' PIOCWAIT */ + + /* Clear process waitstat profiling state. */ + (void)waitstat_deregister(p); PROC_UNLOCK(p); /* =================================================================== RCS file: kern/kern_waitstat.c diff -N kern/kern_waitstat.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ kern/kern_waitstat.c 1 Jan 2005 19:01:11 -0000 @@ -0,0 +1,310 @@ +/*- + * Copyright (c) 2004-2005 Robert N. M. Watson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * waitstat is a simple profiling tool to profile where in the kernel a + * process typically waits -- i.e., primarily on synchronization, networking, + * file system I/O, etc. Each process has optional waitstat storage hung off + * of it, as well as a timestamp in each thread used to track entry and exit + * from waits. Sysctls are used to attach wait state monitoring to a + * process, detach it, and query current stats. Access control checks are + * the same as used for debugging since waitstat tracing reveals a lot about + * the internal operations of a process. waitstat state for a process is + * cleared on exec() and exit(), as well as implicitly for the child + * following fork(). + */ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static MALLOC_DEFINE(M_WAITSTAT, "waitstat", "Per-thread/proc waitstat data"); +static struct mtx waitstat_mtx; +MTX_SYSINIT(waitstat_mtx, &waitstat_mtx, "waitstat_mtx", MTX_SPIN); + +static int +waitstat_register(struct thread *td, pid_t pid) +{ + struct waitstat_proc *wp; + struct proc *targetp; + int error; + + wp = malloc(sizeof(*wp), M_WAITSTAT, M_WAITOK | M_ZERO); + + targetp = pfind(pid); + if (targetp == NULL) { + free(wp, M_WAITSTAT); + return (ESRCH); + } + + error = p_candebug(td, targetp); + if (error) { + PROC_UNLOCK(targetp); + free(wp, M_WAITSTAT); + return (error); + } + + mtx_lock_spin(&waitstat_mtx); + if (targetp->p_waitstat != NULL) { + mtx_unlock_spin(&waitstat_mtx); + PROC_UNLOCK(targetp); + free(wp, M_WAITSTAT); + return (EBUSY); + } + targetp->p_waitstat = wp; + targetp->p_flag |= P_WAITSTAT; + /* + * wp_session_started is left 0 until the first entry to a wait. + * binuptime(&wp->wp_session_started); + */ + mtx_unlock_spin(&waitstat_mtx); + PROC_UNLOCK(targetp); + return (0); +} + +int +waitstat_deregister(struct proc *targetp) +{ + struct waitstat_proc *wp; + + PROC_LOCK_ASSERT(targetp, MA_OWNED); + mtx_lock_spin(&waitstat_mtx); + if (targetp->p_waitstat == NULL) { + mtx_unlock_spin(&waitstat_mtx); + return (EBUSY); + } + wp = targetp->p_waitstat; + targetp->p_waitstat = NULL; + targetp->p_flag &= ~P_WAITSTAT; + mtx_unlock_spin(&waitstat_mtx); + free(wp, M_WAITSTAT); + return (0); +} + +static int +waitstat_userderegister(struct thread *td, pid_t pid) +{ + struct proc *targetp; + int error; + + targetp = pfind(pid); + if (targetp == NULL) + return (ESRCH); + + error = p_candebug(td, targetp); + if (error) { + PROC_UNLOCK(targetp); + return (error); + } + error = waitstat_deregister(targetp); + PROC_UNLOCK(targetp); + return (error); +} + +/* + * Simple live statistics gathering for thread waits. Whenever a thread + * begins to sleep, we grab the current time. + */ +void +waitstat_enter(struct thread *td, struct proc *p, const char *wmesg) +{ + struct waitstat_proc *wp; + + mtx_lock_spin(&waitstat_mtx); + wp = p->p_waitstat; + if (wp != NULL) { + getbinuptime(&td->td_waitstat_time); + td->td_waitstat_wm = wmesg; + /* + * If this is the first time the process has entered a wait, + * then we record the time so that user applications can + * calculate various statistics relative to the start time. + * This carefully omits any "in-progress" wait that was + * going on during attach. + */ + if (wp->wp_session_started.sec == 0) + binuptime(&wp->wp_session_started); + } + mtx_unlock_spin(&waitstat_mtx); +} + +/* + * Simple live statistics gathering for thread waits. Whenever a thread ends + * a sleep, we generate a wait delta given the enter time stamp and update + * the per-process statistics table. + */ +void +waitstat_exit(struct thread *td, struct proc *p) +{ + struct waitstat_channel *wc; + struct waitstat_proc *wp; + struct bintime now; + const char *wmesg; + int i; + + wmesg = td->td_waitstat_wm; + mtx_lock_spin(&waitstat_mtx); + wp = p->p_waitstat; + if (wp == NULL) { + mtx_unlock_spin(&waitstat_mtx); + return; + } + /* + * We ignore any exits from waits where the cached wmesg reference is + * NULL, as it means we didn't see the entrance to the wait. + */ + if (wmesg == NULL) { + mtx_unlock_spin(&waitstat_mtx); + return; + } + binuptime(&now); + bintime_sub(&now, &td->td_waitstat_time); + /* + * Find an existing or new stats block for this wait channel. + */ + for (i = 0; i < WAITSTAT_MAX_STATES; i++) { + wc = &wp->wp_stats[i]; + if (wc->wpc_wmesg[0] == '\0') + break; + if (strncmp(wmesg, wc->wpc_wmesg, WAITSTAT_WMESGLEN) == 0) + break; + } + if (i == WAITSTAT_MAX_STATES) { + wp->wp_overflows++; + mtx_unlock_spin(&waitstat_mtx); + return; + } + /* + * If there's no current wait string in the selected stat entry, then + * this is not a previously seen wait state. + */ + if (wc->wpc_wmesg[0] == '\0') { + strncpy(wc->wpc_wmesg, wmesg, WAITSTAT_WMESGLEN); + wc->wpc_wmesg[WAITSTAT_WMESGLEN] = '\0'; + } + bintime_add(&wc->wpc_totwait, &now); + if (bintime_gt(&now, &wc->wpc_maxwait)) + wc->wpc_maxwait = now; + wc->wpc_numwaits++; + bintime_add(&wp->wp_waited_time, &now); + mtx_unlock_spin(&waitstat_mtx); +} + +static int +sysctl_waitstat_register(SYSCTL_HANDLER_ARGS) +{ + int error, pid; + + pid = 0; + error = sysctl_handle_int(oidp, &pid, 0, req); + if (error) + return (error); + error = waitstat_register(req->td, pid); + return (error); +} + +static int +sysctl_waitstat_deregister(SYSCTL_HANDLER_ARGS) +{ + int error, pid; + + pid = 0; + error = sysctl_handle_int(oidp, &pid, 0, req); + if (error) + return (error); + error = waitstat_userderegister(req->td, pid); + return (error); +} + +static int +sysctl_waitstat_getstat(SYSCTL_HANDLER_ARGS) +{ + struct waitstat_proc *wp, *wp_local; + int error, *name, pid; + struct proc *targetp; + u_int namelen; + + namelen = arg2; + if (namelen != 1) + return (EINVAL); + + wp_local = malloc(sizeof(*wp), M_WAITSTAT, M_WAITOK | M_ZERO); + + name = (int *)arg1; + pid = (pid_t)(name[0]); + targetp = pfind(pid); + if (targetp == NULL) { + free(wp_local, M_WAITSTAT); + return (ESRCH); + } + + error = p_candebug(req->td, targetp); + if (error) { + PROC_UNLOCK(targetp); + return (error); + } + + mtx_lock_spin(&waitstat_mtx); + wp = targetp->p_waitstat; + if (wp != NULL) + *wp_local = *wp; + else + bzero(wp_local, sizeof(*wp_local)); + binuptime(&wp_local->wp_session_current); + mtx_unlock_spin(&waitstat_mtx); + PROC_UNLOCK(targetp); + error = SYSCTL_OUT(req, wp_local, sizeof(*wp_local)); + free(wp_local, M_WAITSTAT); + return (error); +} + +SYSCTL_NODE(_kern, OID_AUTO, waitstat, CTLFLAG_RW, 0, "waitstat facility"); +SYSCTL_PROC(_kern_waitstat, OID_AUTO, register, + CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, sysctl_waitstat_register, + "I", "Register a process for waitstat"); +SYSCTL_PROC(_kern_waitstat, OID_AUTO, deregister, + CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, sysctl_waitstat_deregister, + "I", "De-register a process for waitstat"); +SYSCTL_NODE(_kern_waitstat, OID_AUTO, stats, CTLFLAG_RD|CTLFLAG_ANYBODY, + sysctl_waitstat_getstat, "Get process waitstat block"); Index: kern/subr_sleepqueue.c =================================================================== RCS file: /home/ncvs/src/sys/kern/subr_sleepqueue.c,v retrieving revision 1.14 diff -u -r1.14 subr_sleepqueue.c --- kern/subr_sleepqueue.c 5 Nov 2004 20:19:58 -0000 1.14 +++ kern/subr_sleepqueue.c 31 Dec 2004 19:56:19 -0000 @@ -76,6 +76,7 @@ #include #include #include +#include /* * Constants for the hash table of sleep queue chains. These constants are @@ -265,6 +266,10 @@ struct thread *td; td = curthread; + + if (td->td_proc->p_flag & P_WAITSTAT) + waitstat_enter(td, td->td_proc, wmesg); + sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); MPASS(td->td_sleepqueue != NULL); @@ -432,6 +437,8 @@ KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING")); CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_proc->p_comm); + if (td->td_proc->p_flag & P_WAITSTAT) + waitstat_exit(td, td->td_proc); } /* Index: sys/proc.h =================================================================== RCS file: /home/ncvs/src/sys/sys/proc.h,v retrieving revision 1.415 diff -u -r1.415 proc.h --- sys/proc.h 30 Dec 2004 20:52:44 -0000 1.415 +++ sys/proc.h 31 Dec 2004 19:18:50 -0000 @@ -289,6 +289,8 @@ int td_xsig; /* (c) Signal for ptrace */ u_long td_profil_addr; /* (k) Temporary addr until AST. */ u_int td_profil_ticks; /* (k) Temporary ticks until AST. */ + struct bintime td_waitstat_time; /* (k) bintime for waitstat. */ + const void *td_waitstat_wm; /* (k) wmesg for waitstat. */ #define td_endzero td_base_pri /* Copied during fork1() or thread_sched_upcall(). */ @@ -573,6 +575,7 @@ int p_suspcount; /* (c) Num threads in suspended mode. */ struct thread *p_xthread; /* (c) Trap thread */ int p_boundary_count;/* (c) Num threads at user boundary */ + void *p_waitstat; /* (-) Waitstat measurements. */ /* End area that is zeroed on creation. */ #define p_endzero p_magic @@ -634,6 +637,7 @@ #define P_SIGEVENT 0x200000 /* Process pending signals changed. */ #define P_SINGLE_BOUNDARY 0x400000 /* Threads should suspend at user boundary. */ #define P_JAILED 0x1000000 /* Process is in jail. */ +#define P_WAITSTAT 0x2000000 /* Process being waitstat'd. */ #define P_INEXEC 0x4000000 /* Process is in execve(). */ #define P_STOPPED (P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE) Index: sys/time.h =================================================================== RCS file: /home/ncvs/src/sys/sys/time.h,v retrieving revision 1.65 diff -u -r1.65 time.h --- sys/time.h 7 Apr 2004 04:19:49 -0000 1.65 +++ sys/time.h 1 Jan 2005 12:04:35 -0000 @@ -55,6 +55,17 @@ uint64_t frac; }; +static __inline int +bintime_gt(struct bintime *bt1, struct bintime *bt2) +{ + + if (bt1->sec > bt2->sec) + return (1); + if (bt1->frac > bt2->frac) + return (1); + return (0); +} + static __inline void bintime_addx(struct bintime *bt, uint64_t x) { @@ -79,6 +90,14 @@ } static __inline void +bintime_divx(struct bintime *bt, uint64_t x) +{ + + bt->frac /= x; + bt->sec /= x; +} + +static __inline void bintime_sub(struct bintime *bt, const struct bintime *bt2) { uint64_t u; Index: sys/waitstat.h =================================================================== RCS file: sys/waitstat.h diff -N sys/waitstat.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/waitstat.h 1 Jan 2005 19:01:18 -0000 @@ -0,0 +1,55 @@ +/*- + * Copyright (c) 2004-2005 Robert N. M. Watson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_WAITSTAT_H_ +#define _SYS_WAITSTAT_H_ + +#define WAITSTAT_WMESGLEN 16 +struct waitstat_channel { + char wpc_wmesg[WAITSTAT_WMESGLEN + 1]; + struct bintime wpc_maxwait; + struct bintime wpc_totwait; + u_int wpc_numwaits; +}; + +#define WAITSTAT_MAX_STATES 1024 +struct waitstat_proc { + struct bintime wp_session_started; + struct bintime wp_session_current; + struct bintime wp_waited_time; + u_int wp_overflows; + struct waitstat_channel wp_stats[WAITSTAT_MAX_STATES]; +}; + +#ifdef _KERNEL +int waitstat_deregister(struct proc *p); +void waitstat_enter(struct thread *td, struct proc *p, const char *wmesg); +void waitstat_exit(struct thread *td, struct proc *p); +#endif + +#endif /* !_SYS_WAITSTAT_H_ */