--- //depot/vendor/freebsd/src/sys/kern/kern_exit.c 2005/11/08 17:15:28 +++ //depot/user/rwatson/ktrace/src/sys/kern/kern_exit.c 2005/11/12 13:53:30 @@ -366,8 +366,13 @@ (void)acct_process(td); #ifdef KTRACE /* - * release trace file + * Drain any pending records on the thread and release the trace + * file. + * + * XXXRW: These should be done atomically, so that there isn't a + * race. Probably via ktrexit(). */ + ktrsysretdrain(); PROC_LOCK(p); mtx_lock(&ktrace_mtx); p->p_traceflag = 0; /* don't trace the vrele() */ --- //depot/vendor/freebsd/src/sys/kern/kern_fork.c 2005/07/01 16:35:11 +++ //depot/user/rwatson/ktrace/src/sys/kern/kern_fork.c 2005/11/03 14:04:09 @@ -272,6 +272,7 @@ mac_init_proc(newproc); #endif knlist_init(&newproc->p_klist, &newproc->p_mtx, NULL, NULL, NULL); + STAILQ_INIT(&newproc->p_ktr); /* We have to lock the process tree while we look for a pid. */ sx_slock(&proctree_lock); --- //depot/vendor/freebsd/src/sys/kern/kern_ktrace.c 2005/11/01 14:50:28 +++ //depot/user/rwatson/ktrace/src/sys/kern/kern_ktrace.c 2005/11/12 13:53:10 @@ -1,6 +1,8 @@ /*- * Copyright (c) 1989, 1993 - * The Regents of the University of California. All rights reserved. + * The Regents of the University of California. + * Copyright (c) 2005 Robert N. M. Watson + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -54,6 +56,25 @@ #include #include +/* + * The ktrace facility allows the tracing of certain key events in user space + * processes, such as system calls, signal delivery, context switches, and + * user generated events using utrace(2). It works by streaming event + * records and data to a vnode associated with the process using the + * ktrace(2) system call. In general, records can be written directly from + * the context that generates the event. One important exception to this is + * during a context switch, where sleeping is not permitted. To handle this + * case, trace events are generated using in-kernel ktr_request records, and + * then delivered to disk at a convenient moment -- either immediately, the + * next traceable event, at system call return, or at process exit. + * + * When dealing with multiple threads or processes writing to the same event + * log, ordering guarantees are weak: specifically, if an event has multiple + * records (i.e., system call enter and return), they may be interlaced with + * records from another event. Process and thread ID information is provided + * in the record, and user applications can de-interlace events if required. + */ + static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE"); #ifdef KTRACE @@ -65,8 +86,6 @@ struct ktr_request { struct ktr_header ktr_header; void *ktr_buffer; - struct ucred *ktr_cred; - struct vnode *ktr_vp; union { struct ktr_syscall ktr_syscall; struct ktr_sysret ktr_sysret; @@ -88,7 +107,6 @@ 0 /* KTR_USER */ }; -static STAILQ_HEAD(, ktr_request) ktr_todo; static STAILQ_HEAD(, ktr_request) ktr_free; static SYSCTL_NODE(_kern, OID_AUTO, ktrace, CTLFLAG_RD, 0, "KTRACE options"); @@ -104,34 +122,62 @@ static int print_message = 1; struct mtx ktrace_mtx; static struct cv ktrace_cv; +static struct sx ktrace_sx; static void ktrace_init(void *dummy); static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS); static u_int ktrace_resize_pool(u_int newsize); static struct ktr_request *ktr_getrequest(int type); -static void ktr_submitrequest(struct ktr_request *req); +static void ktr_submitrequest(struct thread *td, struct ktr_request *req); static void ktr_freerequest(struct ktr_request *req); -static void ktr_loop(void *dummy); -static void ktr_writerequest(struct ktr_request *req); +static void ktr_writerequest(struct thread *td, struct ktr_request *req); static int ktrcanset(struct thread *,struct proc *); static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *); static int ktrops(struct thread *,struct proc *,int,int,struct vnode *); +/* + * ktrace itself generates events, such as context switches, which we do not + * wish to trace. Maintain a flag, TDP_INKTRACE, on each thread to determine + * whether or not it is in a region where tracing of events should be + * suppressed. + */ +static void +ktrace_enter(struct thread *td) +{ + + KASSERT(!(td->td_pflags & TDP_INKTRACE), ("ktrace_enter: flag set")); + td->td_pflags |= TDP_INKTRACE; +} + +static void +ktrace_exit(struct thread *td) +{ + + KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_exit: flag not set")); + td->td_pflags &= ~TDP_INKTRACE; +} + static void +ktrace_assert(struct thread *td) +{ + + KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_assert: flag not set")); +} + +static void ktrace_init(void *dummy) { struct ktr_request *req; int i; mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF | MTX_QUIET); + sx_init(&ktrace_sx, "ktrace_sx"); cv_init(&ktrace_cv, "ktrace"); - STAILQ_INIT(&ktr_todo); STAILQ_INIT(&ktr_free); for (i = 0; i < ktr_requestpool; i++) { req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK); STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list); } - kthread_create(ktr_loop, NULL, NULL, RFHIGHPID, 0, "ktrace"); } SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL); @@ -154,12 +200,12 @@ if (error) return (error); td = curthread; - td->td_pflags |= TDP_INKTRACE; + ktrace_enter(td); mtx_lock(&ktrace_mtx); oldsize = ktr_requestpool; newsize = ktrace_resize_pool(wantsize); mtx_unlock(&ktrace_mtx); - td->td_pflags &= ~TDP_INKTRACE; + ktrace_exit(td); error = SYSCTL_OUT(req, &oldsize, sizeof(u_int)); if (error) return (error); @@ -214,27 +260,21 @@ struct proc *p = td->td_proc; int pm; - td->td_pflags |= TDP_INKTRACE; - mtx_lock(&ktrace_mtx); + ktrace_enter(td); /* XXX: In caller instead? */ if (!KTRCHECK(td, type)) { - mtx_unlock(&ktrace_mtx); - td->td_pflags &= ~TDP_INKTRACE; + ktrace_exit(td); return (NULL); } + mtx_lock(&ktrace_mtx); req = STAILQ_FIRST(&ktr_free); if (req != NULL) { STAILQ_REMOVE_HEAD(&ktr_free, ktr_list); + mtx_unlock(&ktrace_mtx); req->ktr_header.ktr_type = type; if (p->p_traceflag & KTRFAC_DROP) { req->ktr_header.ktr_type |= KTR_DROP; p->p_traceflag &= ~KTRFAC_DROP; } - KASSERT(p->p_tracevp != NULL, ("ktrace: no trace vnode")); - KASSERT(p->p_tracecred != NULL, ("ktrace: no trace cred")); - req->ktr_vp = p->p_tracevp; - VREF(p->p_tracevp); - req->ktr_cred = crhold(p->p_tracecred); - mtx_unlock(&ktrace_mtx); microtime(&req->ktr_header.ktr_time); req->ktr_header.ktr_pid = p->p_pid; req->ktr_header.ktr_tid = td->td_tid; @@ -248,32 +288,82 @@ mtx_unlock(&ktrace_mtx); if (pm) printf("Out of ktrace request objects.\n"); - td->td_pflags &= ~TDP_INKTRACE; + ktrace_exit(td); } return (req); } +/* + * Some trace generation environments don't permit direct access to VFS, + * such as during a context switch where sleeping is not allowed. Under these + * circumstances, queue a request to the thread to be written asynchronously + * later. + */ static void -ktr_submitrequest(struct ktr_request *req) +ktr_enqueuerequest(struct thread *td, struct ktr_request *req) { mtx_lock(&ktrace_mtx); - STAILQ_INSERT_TAIL(&ktr_todo, req, ktr_list); - cv_signal(&ktrace_cv); + STAILQ_INSERT_TAIL(&td->td_proc->p_ktr, req, ktr_list); mtx_unlock(&ktrace_mtx); - curthread->td_pflags &= ~TDP_INKTRACE; + ktrace_exit(td); +} + +/* + * Drain any pending ktrace records from the per-thread queue to disk. This + * is used both internally before committing other records, and also on + * system call return. We drain all the ones we can find at the time when + * drain is requested, but don't keep draining after that as those events + * may me approximately "after" the current event. + */ +static void +ktr_drain(struct thread *td) +{ + struct ktr_request *queued_req; + STAILQ_HEAD(, ktr_request) local_queue; + + ktrace_assert(td); + sx_assert(&ktrace_sx, SX_XLOCKED); + + STAILQ_INIT(&local_queue); /* XXXRW: needed? */ + + if (!STAILQ_EMPTY(&td->td_proc->p_ktr)) { + mtx_lock(&ktrace_mtx); + STAILQ_CONCAT(&local_queue, &td->td_proc->p_ktr); + mtx_unlock(&ktrace_mtx); + + while ((queued_req = STAILQ_FIRST(&local_queue))) { + STAILQ_REMOVE_HEAD(&local_queue, ktr_list); + ktr_writerequest(td, queued_req); + ktr_freerequest(queued_req); + } + } +} + +/* + * Submit a trace record for immediate commit to disk -- to be used only + * where entering VFS is OK. First drain any pending records that may have + * been cached in the thread. + */ +static void +ktr_submitrequest(struct thread *td, struct ktr_request *req) +{ + + ktrace_assert(td); + + sx_xlock(&ktrace_sx); + ktr_drain(td); + ktr_writerequest(td, req); + ktr_freerequest(req); + sx_xunlock(&ktrace_sx); + + ktrace_exit(td); } static void ktr_freerequest(struct ktr_request *req) { - crfree(req->ktr_cred); - if (req->ktr_vp != NULL) { - mtx_lock(&Giant); - vrele(req->ktr_vp); - mtx_unlock(&Giant); - } if (req->ktr_buffer != NULL) free(req->ktr_buffer, M_KTRACE); mtx_lock(&ktrace_mtx); @@ -281,38 +371,6 @@ mtx_unlock(&ktrace_mtx); } -static void -ktr_loop(void *dummy) -{ - struct ktr_request *req; - struct thread *td; - struct ucred *cred; - - /* Only cache these values once. */ - td = curthread; - cred = td->td_ucred; - for (;;) { - mtx_lock(&ktrace_mtx); - while (STAILQ_EMPTY(&ktr_todo)) - cv_wait(&ktrace_cv, &ktrace_mtx); - req = STAILQ_FIRST(&ktr_todo); - STAILQ_REMOVE_HEAD(&ktr_todo, ktr_list); - KASSERT(req != NULL, ("got a NULL request")); - mtx_unlock(&ktrace_mtx); - /* - * It is not enough just to pass the cached cred - * to the VOP's in ktr_writerequest(). Some VFS - * operations use curthread->td_ucred, so we need - * to modify our thread's credentials as well. - * Evil. - */ - td->td_ucred = req->ktr_cred; - ktr_writerequest(req); - td->td_ucred = cred; - ktr_freerequest(req); - } -} - /* * MPSAFE */ @@ -344,7 +402,7 @@ req->ktr_header.ktr_len = buflen; req->ktr_buffer = buf; } - ktr_submitrequest(req); + ktr_submitrequest(curthread, req); } /* @@ -365,7 +423,24 @@ ktp->ktr_code = code; ktp->ktr_error = error; ktp->ktr_retval = retval; /* what about val2 ? */ - ktr_submitrequest(req); + ktr_submitrequest(curthread, req); +} + +/* + * When a system call returns, it may be necessary to drain ktrace records + * hung off the thread from an earlier context switch. Also necessary when + * a thread exits. + */ +void +ktrsysretdrain(void) +{ + struct thread *td = curthread; + + ktrace_enter(td); + sx_xlock(&ktrace_sx); + ktr_drain(td); + sx_xunlock(&ktrace_sx); + ktrace_exit(td); } void @@ -391,7 +466,7 @@ req->ktr_header.ktr_len = namelen; req->ktr_buffer = buf; } - ktr_submitrequest(req); + ktr_submitrequest(curthread, req); } /* @@ -439,7 +514,7 @@ ktg->ktr_rw = rw; req->ktr_header.ktr_len = datalen; req->ktr_buffer = buf; - ktr_submitrequest(req); + ktr_submitrequest(curthread, req); } void @@ -460,7 +535,7 @@ kp->action = action; kp->mask = *mask; kp->code = code; - ktr_submitrequest(req); + ktr_enqueuerequest(curthread, req); } void @@ -476,7 +551,7 @@ kc = &req->ktr_data.ktr_csw; kc->out = out; kc->user = user; - ktr_submitrequest(req); + ktr_enqueuerequest(curthread, req); } #endif /* KTRACE */ @@ -519,7 +594,7 @@ if (ops != KTROP_CLEARFILE && facs == 0) return (EINVAL); - td->td_pflags |= TDP_INKTRACE; + ktrace_enter(td); if (ops != KTROP_CLEAR) { /* * an operation which requires a file argument. @@ -530,7 +605,7 @@ error = vn_open(&nd, &flags, 0, -1); if (error) { mtx_unlock(&Giant); - td->td_pflags &= ~TDP_INKTRACE; + ktrace_exit(td); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); @@ -539,7 +614,7 @@ if (vp->v_type != VREG) { (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); mtx_unlock(&Giant); - td->td_pflags &= ~TDP_INKTRACE; + ktrace_exit(td); return (EACCES); } mtx_unlock(&Giant); @@ -647,7 +722,7 @@ (void) vn_close(vp, FWRITE, td->td_ucred, td); mtx_unlock(&Giant); } - td->td_pflags &= ~TDP_INKTRACE; + ktrace_exit(td); return (error); #else /* !KTRACE */ return (ENOSYS); @@ -688,7 +763,7 @@ } req->ktr_buffer = cp; req->ktr_header.ktr_len = uap->len; - ktr_submitrequest(req); + ktr_submitrequest(td, req); return (0); #else /* !KTRACE */ return (ENOSYS); @@ -787,12 +862,11 @@ } static void -ktr_writerequest(struct ktr_request *req) +ktr_writerequest(struct thread *td, struct ktr_request *req) { struct ktr_header *kth; struct vnode *vp; struct proc *p; - struct thread *td; struct ucred *cred; struct uio auio; struct iovec aiov[3]; @@ -800,18 +874,36 @@ int datalen, buflen, vrele_count; int error; - vp = req->ktr_vp; + /* + * We hold the vnode and credential for use in I/O in case ktrace is + * disabled on the process as we write out the request. + * + * XXXRW: This is not ideal: we could end up performing a write after + * the vnode has been closed. + */ + mtx_lock(&ktrace_mtx); + vp = td->td_proc->p_tracevp; + if (vp != NULL) + VREF(vp); + cred = td->td_proc->p_tracecred; + if (cred != NULL) + crhold(cred); + mtx_unlock(&ktrace_mtx); + /* * If vp is NULL, the vp has been cleared out from under this - * request, so just drop it. + * request, so just drop it. Make sure the credential and vnode are + * in sync: we should have both or neither. */ - if (vp == NULL) + if (vp == NULL) { + KASSERT(cred == NULL, ("ktr_writerequest: cred != NULL")); return; + } + KASSERT(cred != NULL, ("ktr_writerequest: cred == NULL")); + kth = &req->ktr_header; datalen = data_lengths[(u_short)kth->ktr_type & ~KTR_DROP]; buflen = kth->ktr_len; - cred = req->ktr_cred; - td = curthread; auio.uio_iov = &aiov[0]; auio.uio_offset = 0; auio.uio_segflg = UIO_SYSSPACE; @@ -835,6 +927,7 @@ auio.uio_resid += buflen; auio.uio_iovcnt++; } + mtx_lock(&Giant); vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); @@ -884,17 +977,12 @@ } } sx_sunlock(&allproc_lock); + /* - * Second, clear this vnode from any pending requests. + * We can't clear any pending requests in threads that have cached + * them but not yet committed them, as those are per-thread. The + * thread will have to clear it itself on system call return. */ - mtx_lock(&ktrace_mtx); - STAILQ_FOREACH(req, &ktr_todo, ktr_list) { - if (req->ktr_vp == vp) { - req->ktr_vp = NULL; - vrele_count++; - } - } - mtx_unlock(&ktrace_mtx); mtx_lock(&Giant); while (vrele_count-- > 0) vrele(vp); --- //depot/vendor/freebsd/src/sys/kern/kern_proc.c 2005/11/08 09:13:20 +++ //depot/user/rwatson/ktrace/src/sys/kern/kern_proc.c 2005/11/12 13:53:30 @@ -156,6 +156,7 @@ KASSERT((td != NULL), ("proc_dtor: bad thread pointer")); kg = FIRST_KSEGRP_IN_PROC(p); KASSERT((kg != NULL), ("proc_dtor: bad kg pointer")); + KASSERT(STAILQ_EMPTY(&p->p_ktr), ("proc_dtor: non-empty p_ktr")); #endif /* Dispose of an alternate kstack, if it exists. --- //depot/vendor/freebsd/src/sys/sys/ktrace.h 2005/11/01 14:50:28 +++ //depot/user/rwatson/ktrace/src/sys/sys/ktrace.h 2005/11/02 14:14:37 @@ -68,6 +68,7 @@ #define KTRCHECK(td, type) ((td)->td_proc->p_traceflag & (1 << type)) #define KTRPOINT(td, type) \ (KTRCHECK((td), (type)) && !((td)->td_pflags & TDP_INKTRACE)) +#define KTRCHECKDRAIN(td) (!(STAILQ_EMPTY(&(td)->td_ktr))) /* * ktrace record types @@ -174,6 +175,7 @@ void ktrgenio(int, enum uio_rw, struct uio *, int); void ktrsyscall(int, int narg, register_t args[]); void ktrsysret(int, int, register_t); +void ktrsysretdrain(void); #else --- //depot/vendor/freebsd/src/sys/sys/proc.h 2005/11/08 09:13:20 +++ //depot/user/rwatson/ktrace/src/sys/sys/proc.h 2005/11/12 13:53:30 @@ -608,6 +608,7 @@ void *p_emuldata; /* (c) Emulator state data. */ struct label *p_label; /* (*) Proc (not subject) MAC label. */ struct p_sched *p_sched; /* (*) Scheduler-specific data. */ + STAILQ_HEAD(, ktr_request) p_ktr; /* (o) KTR event queue. */ }; #define p_session p_pgrp->pg_session