From 04c4b9f4841ce9097f990ae836423d8b182733e7 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 22 Nov 2024 11:24:29 -0800 Subject: [PATCH] proc: invalidate task inodes when a task is destroyed PiperOrigin-RevId: 699233555 --- pkg/sentry/fsimpl/cgroupfs/cgroupfs.go | 1 + pkg/sentry/fsimpl/devpts/devpts.go | 1 + pkg/sentry/fsimpl/devpts/master.go | 1 + pkg/sentry/fsimpl/devpts/replica.go | 1 + pkg/sentry/fsimpl/fuse/inode.go | 1 + pkg/sentry/fsimpl/host/host.go | 1 + .../fsimpl/kernfs/dynamic_bytes_file.go | 1 + pkg/sentry/fsimpl/kernfs/filesystem.go | 43 ++++++++-------- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 13 +++++ pkg/sentry/fsimpl/kernfs/kernfs.go | 50 +++++++++++++++++++ pkg/sentry/fsimpl/kernfs/kernfs_test.go | 2 + pkg/sentry/fsimpl/kernfs/symlink.go | 1 + .../fsimpl/kernfs/synthetic_directory.go | 1 + pkg/sentry/fsimpl/mqfs/root.go | 1 + pkg/sentry/fsimpl/nsfs/nsfs.go | 1 + pkg/sentry/fsimpl/pipefs/pipefs.go | 1 + pkg/sentry/fsimpl/proc/BUILD | 1 + pkg/sentry/fsimpl/proc/filesystem_state.go | 23 +++++++++ pkg/sentry/fsimpl/proc/subtasks.go | 1 + pkg/sentry/fsimpl/proc/task.go | 10 ++++ pkg/sentry/fsimpl/proc/task_fds.go | 3 ++ pkg/sentry/fsimpl/proc/task_files.go | 6 +++ pkg/sentry/fsimpl/proc/tasks.go | 1 + pkg/sentry/fsimpl/proc/tasks_files.go | 2 + pkg/sentry/fsimpl/sockfs/sockfs.go | 1 + pkg/sentry/fsimpl/sys/kcov.go | 1 + pkg/sentry/fsimpl/sys/sys.go | 1 + pkg/sentry/kernel/task.go | 4 ++ pkg/sentry/kernel/task_exit.go | 42 ++++++++++++++++ pkg/sentry/kernel/task_start.go | 49 +++++++++--------- 30 files changed, 220 insertions(+), 45 deletions(-) create mode 100644 pkg/sentry/fsimpl/proc/filesystem_state.go diff --git a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go index 3bce26e90a..c291a369fd 100644 --- a/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go +++ b/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go @@ -511,6 +511,7 @@ type dir struct { kernfs.InodeNotSymlink kernfs.InodeWatches kernfs.OrderedChildren + 
kernfs.InodeFSOwned implStatFS locks vfs.FileLocks diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index ed33420362..293351b490 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -229,6 +229,7 @@ type rootInode struct { kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid. kernfs.InodeWatches kernfs.OrderedChildren + kernfs.InodeFSOwned rootInodeRefs locks vfs.FileLocks diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go index 5bf13481f4..48ef3184ca 100644 --- a/pkg/sentry/fsimpl/devpts/master.go +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -39,6 +39,7 @@ type masterInode struct { kernfs.InodeNotAnonymous kernfs.InodeNotDirectory kernfs.InodeNotSymlink + kernfs.InodeFSOwned kernfs.InodeWatches locks vfs.FileLocks diff --git a/pkg/sentry/fsimpl/devpts/replica.go b/pkg/sentry/fsimpl/devpts/replica.go index bc30831479..75e4954438 100644 --- a/pkg/sentry/fsimpl/devpts/replica.go +++ b/pkg/sentry/fsimpl/devpts/replica.go @@ -39,6 +39,7 @@ type replicaInode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink kernfs.InodeWatches + kernfs.InodeFSOwned locks vfs.FileLocks diff --git a/pkg/sentry/fsimpl/fuse/inode.go b/pkg/sentry/fsimpl/fuse/inode.go index 09c74a5b60..9f026372db 100644 --- a/pkg/sentry/fsimpl/fuse/inode.go +++ b/pkg/sentry/fsimpl/fuse/inode.go @@ -50,6 +50,7 @@ type inode struct { kernfs.InodeWatches kernfs.OrderedChildren kernfs.CachedMappable + kernfs.InodeFSOwned // the owning filesystem. fs is immutable. fs *filesystem diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 1b2cfd5e57..2aa0a74916 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -100,6 +100,7 @@ type inode struct { kernfs.InodeNotSymlink kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid. 
kernfs.InodeWatches + kernfs.InodeFSOwned locks vfs.FileLocks diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index cae0a81f4e..49770b554a 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -42,6 +42,7 @@ type DynamicBytesFile struct { InodeNotDirectory InodeNotSymlink InodeWatches + InodeFSOwned locks vfs.FileLocks // data can additionally implement vfs.WritableDynamicBytesSource to support diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 7caa2025d4..e8357901d5 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -109,7 +109,27 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir parent.dirMu.Lock() defer parent.dirMu.Unlock() // may be temporarily unlocked and re-locked below child := parent.children[name] - for child != nil { + for { + if child == nil { + // Dentry isn't cached; it either doesn't exist or failed revalidation. + // Attempt to resolve it via Lookup. + childInode, err := parent.inode.Lookup(ctx, name) + if err != nil { + return nil, err + } + var newChild Dentry + newChild.Init(fs, childInode) // childInode's ref is transferred to newChild. + parent.insertChildLocked(name, &newChild) + child = &newChild + + // Drop the ref on newChild. This will cause the dentry to get pruned + // from the dentry tree by the end of current filesystem operation + // (before returning to the VFS layer) if another ref is not picked on + // this dentry. + if !childInode.Keep() { + fs.deferDecRef(&newChild) + } + } // Cached dentry exists, revalidate. if child.inode.Valid(ctx, parent, name) { break @@ -120,26 +140,7 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir parent.dirMu.Lock() // Check for concurrent insertion of a new cached dentry. 
child = parent.children[name] - } - if child == nil { - // Dentry isn't cached; it either doesn't exist or failed revalidation. - // Attempt to resolve it via Lookup. - childInode, err := parent.inode.Lookup(ctx, name) - if err != nil { - return nil, err - } - var newChild Dentry - newChild.Init(fs, childInode) // childInode's ref is transferred to newChild. - parent.insertChildLocked(name, &newChild) - child = &newChild - - // Drop the ref on newChild. This will cause the dentry to get pruned - // from the dentry tree by the end of current filesystem operation - // (before returning to the VFS layer) if another ref is not picked on - // this dentry. - if !childInode.Keep() { - fs.deferDecRef(&newChild) - } + } return child, nil } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 3ec38cb82a..d26de52dd4 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -735,6 +735,7 @@ type StaticDirectory struct { InodeWatches OrderedChildren StaticDirectoryRefs + InodeFSOwned locks vfs.FileLocks fdOpts GenericDirectoryFDOptions @@ -845,3 +846,15 @@ type InodeNotAnonymous struct{} func (*InodeNotAnonymous) Anonymous() bool { return false } + +// InodeFSOwned represents inodes whose lifecycle is entirely managed by the +// filesystem. +// +// +stateify savable +type InodeFSOwned struct{} + +// AddInvalidateCallback implements Inode.AddInvalidateCallback. +func (*InodeFSOwned) AddInvalidateCallback(d *Dentry) {} + +// RemoveInvalidateCallback implements Inode.RemoveInvalidateCallback. 
+func (*InodeFSOwned) RemoveInvalidateCallback(d *Dentry) {} diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index e197f64337..d37a1bc8a6 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -317,6 +317,42 @@ func (d *Dentry) decRefLocked(ctx context.Context) { } } +// Invalidate invalidates the dentry and its children. +func (d *Dentry) Invalidate(ctx context.Context) { + d.fs.mu.Lock() + defer d.fs.mu.Unlock() + if d.vfsd.IsDead() { + return + } + parent := d.parent.Load() + if parent == nil { + return + } + parent.dirMu.Lock() + if parent.vfsd.IsDead() { + parent.dirMu.Unlock() + return + } + child := parent.children[d.name] + if child != d { + parent.dirMu.Unlock() + return + } + delete(parent.children, d.name) + parent.dirMu.Unlock() + + children := []*Dentry{d} + for i := 0; i < len(children); i++ { + for _, c := range children[i].children { + children = append(children, c) + } + } + d.fs.invalidateRemovedChildLocked(ctx, d.fs.vfsfs.VirtualFilesystem(), child) + for _, c := range children { + c.evictLocked(ctx) + } +} + // cacheLocked should be called after d's reference count becomes 0. The ref // count check may happen before acquiring d.fs.mu so there might be a race // condition where the ref count is increased again by the time the caller @@ -392,6 +428,15 @@ func (d *Dentry) cacheLocked(ctx context.Context) { // back down to fs.opts.maxCachedDentries, so we don't loop. } +// DropDentryCache cleans up the dentry cache. +func (fs *Filesystem) DropDentryCache(ctx context.Context) { + fs.mu.Lock() + defer fs.mu.Unlock() + for !fs.cachedDentries.Empty() { + fs.cachedDentries.Back().evictLocked(ctx) + } +} + // Preconditions: // - fs.mu must be locked for writing. 
func (fs *Filesystem) evictCachedDentryLocked(ctx context.Context) { @@ -451,6 +496,7 @@ func (d *Dentry) destroy(ctx context.Context) { panic("dentry.destroy() called with references on the dentry") } + d.inode.RemoveInvalidateCallback(d) d.inode.DecRef(ctx) // IncRef from Init. refs.Unregister(d) @@ -505,6 +551,7 @@ func (d *Dentry) Init(fs *Filesystem, inode Inode) { d.flags = atomicbitops.FromUint32(d.flags.RacyLoad() | dflagsIsSymlink) } refs.Register(d) + inode.AddInvalidateCallback(d) } // VFSDentry returns the generic vfs dentry for this kernfs dentry. @@ -732,6 +779,9 @@ type Inode interface { // Anonymous indicates that the Inode is anonymous. It will never have // a name or parent. Anonymous() bool + + AddInvalidateCallback(d *Dentry) + RemoveInvalidateCallback(d *Dentry) } type inodeRefs interface { diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index e67537bdaf..6e487e3dd6 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -109,6 +109,7 @@ type readonlyDir struct { kernfs.InodeTemporary kernfs.InodeWatches kernfs.OrderedChildren + kernfs.InodeFSOwned locks vfs.FileLocks } @@ -146,6 +147,7 @@ type dir struct { kernfs.InodeTemporary kernfs.InodeWatches kernfs.OrderedChildren + kernfs.InodeFSOwned locks vfs.FileLocks diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index 0c9a7b3a64..933da99d53 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -33,6 +33,7 @@ type StaticSymlink struct { InodeSymlink InodeNoStatFS InodeWatches + InodeFSOwned target string } diff --git a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go index 4b28181d5f..41700de43c 100644 --- a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go +++ b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go @@ -37,6 +37,7 @@ type syntheticDirectory struct { InodeWatches 
OrderedChildren syntheticDirectoryRefs + InodeFSOwned locks vfs.FileLocks } diff --git a/pkg/sentry/fsimpl/mqfs/root.go b/pkg/sentry/fsimpl/mqfs/root.go index 7c26e2219d..ecf01d5000 100644 --- a/pkg/sentry/fsimpl/mqfs/root.go +++ b/pkg/sentry/fsimpl/mqfs/root.go @@ -36,6 +36,7 @@ type rootInode struct { kernfs.InodeTemporary kernfs.InodeWatches kernfs.OrderedChildren + kernfs.InodeFSOwned locks vfs.FileLocks } diff --git a/pkg/sentry/fsimpl/nsfs/nsfs.go b/pkg/sentry/fsimpl/nsfs/nsfs.go index 0d5dc2531f..b4cb86820b 100644 --- a/pkg/sentry/fsimpl/nsfs/nsfs.go +++ b/pkg/sentry/fsimpl/nsfs/nsfs.go @@ -83,6 +83,7 @@ type Inode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink kernfs.InodeWatches + kernfs.InodeFSOwned inodeRefs locks vfs.FileLocks diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index 73e85654ff..b332fdbfbb 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -95,6 +95,7 @@ type inode struct { kernfs.InodeNotSymlink kernfs.InodeNoopRefCount kernfs.InodeWatches + kernfs.InodeFSOwned locks vfs.FileLocks pipe *pipe.VFSPipe diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 794779d92f..22a1f7889d 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -66,6 +66,7 @@ go_library( "fd_dir_inode_refs.go", "fd_info_dir_inode_refs.go", "filesystem.go", + "filesystem_state.go", "proc_impl.go", "subtasks.go", "subtasks_inode_refs.go", diff --git a/pkg/sentry/fsimpl/proc/filesystem_state.go b/pkg/sentry/fsimpl/proc/filesystem_state.go new file mode 100644 index 0000000000..4949733c09 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/filesystem_state.go @@ -0,0 +1,23 @@ +// Copyright 2024 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "gvisor.dev/gvisor/pkg/context" +) + +func (fs *filesystem) beforeSave() { + fs.DropDentryCache(context.Background()) +} diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index 60e4dcb072..dab64758a3 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -40,6 +40,7 @@ type subtasksInode struct { kernfs.InodeTemporary kernfs.InodeWatches kernfs.OrderedChildren + kernfs.InodeFSOwned subtasksInodeRefs locks vfs.FileLocks diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index d41a3c0013..e106d5c61c 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -113,6 +113,16 @@ func (fs *filesystem) newTaskInode(ctx context.Context, task *kernel.Task, pidns return inode, nil } +// AddInvalidateCallback implements kernfs.Inode.AddInvalidateCallback. +func (i *taskInode) AddInvalidateCallback(d *kernfs.Dentry) { + i.task.RegisterOnDestroyAction(d, d.Invalidate) +} + +// RemoveInvalidateCallback implements kernfs.Inode.RemoveInvalidateCallback. +func (i *taskInode) RemoveInvalidateCallback(d *kernfs.Dentry) { + i.task.UnregisterOnDestroyAction(d) +} + // Valid implements kernfs.Inode.Valid. This inode remains valid as long // as the task is still running. When it's dead, another tasks with the same // PID could replace it. 
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 83996a4465..14ad5d0937 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -118,6 +118,7 @@ type fdDirInode struct { kernfs.InodeTemporary kernfs.InodeWatches kernfs.OrderedChildren + kernfs.InodeFSOwned } var _ kernfs.Inode = (*fdDirInode)(nil) @@ -202,6 +203,7 @@ type fdSymlink struct { kernfs.InodeNotAnonymous kernfs.InodeSymlink kernfs.InodeWatches + kernfs.InodeFSOwned fs *filesystem task *kernel.Task @@ -264,6 +266,7 @@ type fdInfoDirInode struct { kernfs.InodeTemporary kernfs.InodeWatches kernfs.OrderedChildren + kernfs.InodeFSOwned } var _ kernfs.Inode = (*fdInfoDirInode)(nil) diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index c0d818b473..366d9476ad 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -422,6 +422,7 @@ type memInode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink kernfs.InodeWatches + kernfs.InodeFSOwned task *kernel.Task locks vfs.FileLocks @@ -743,6 +744,7 @@ type statusInode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink kernfs.InodeWatches + kernfs.InodeFSOwned task *kernel.Task pidns *kernel.PIDNamespace @@ -981,6 +983,7 @@ type exeSymlink struct { kernfs.InodeNotAnonymous kernfs.InodeSymlink kernfs.InodeWatches + kernfs.InodeFSOwned fs *filesystem task *kernel.Task @@ -1054,6 +1057,7 @@ type cwdSymlink struct { kernfs.InodeNotAnonymous kernfs.InodeSymlink kernfs.InodeWatches + kernfs.InodeFSOwned fs *filesystem task *kernel.Task @@ -1116,6 +1120,7 @@ type rootSymlink struct { kernfs.InodeNotAnonymous kernfs.InodeSymlink kernfs.InodeWatches + kernfs.InodeFSOwned fs *filesystem task *kernel.Task @@ -1360,6 +1365,7 @@ type namespaceInode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink kernfs.InodeWatches + kernfs.InodeFSOwned locks vfs.FileLocks } diff --git a/pkg/sentry/fsimpl/proc/tasks.go 
b/pkg/sentry/fsimpl/proc/tasks.go index e4948791c0..68dac31605 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -46,6 +46,7 @@ type tasksInode struct { kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid. kernfs.InodeWatches kernfs.OrderedChildren + kernfs.InodeFSOwned tasksInodeRefs locks vfs.FileLocks diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 97eef29178..1febbbc5b4 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -40,6 +40,7 @@ type selfSymlink struct { kernfs.InodeNotAnonymous kernfs.InodeSymlink kernfs.InodeWatches + kernfs.InodeFSOwned pidns *kernel.PIDNamespace } @@ -83,6 +84,7 @@ type threadSelfSymlink struct { kernfs.InodeNotAnonymous kernfs.InodeSymlink kernfs.InodeWatches + kernfs.InodeFSOwned pidns *kernel.PIDNamespace } diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index 4554814471..3e3f01cf22 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -100,6 +100,7 @@ type inode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink kernfs.InodeWatches + kernfs.InodeFSOwned } // Open implements kernfs.Inode.Open. 
diff --git a/pkg/sentry/fsimpl/sys/kcov.go b/pkg/sentry/fsimpl/sys/kcov.go index f713135435..1d6afcd95b 100644 --- a/pkg/sentry/fsimpl/sys/kcov.go +++ b/pkg/sentry/fsimpl/sys/kcov.go @@ -43,6 +43,7 @@ type kcovInode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink kernfs.InodeWatches + kernfs.InodeFSOwned implStatFS } diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 2c3efd0af9..cffb0bfafc 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -383,6 +383,7 @@ type dir struct { kernfs.InodeNotSymlink kernfs.InodeTemporary kernfs.InodeWatches + kernfs.InodeFSOwned kernfs.OrderedChildren locks vfs.FileLocks diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 0c65b18289..993e0129b4 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/atomicbitops" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/metric" @@ -630,6 +631,9 @@ type Task struct { // Origin is the origin of the task. Origin TaskOrigin + + // +checklocks:mu + onDestroyAction map[any]func(ctx context.Context) `state:"nosave"` } // Task related metrics diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 0e8ac8067c..8fb41990da 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -30,6 +30,7 @@ import ( "strconv" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -756,9 +757,50 @@ func (t *Task) exitNotifyLocked(fromPtraceDetach bool) { // Do not clear t.parent. It may be still be needed after the task has exited // (for example, to perform ptrace access checks on /proc/[pid] files). 
} + t.execOnDestroyActions() } } +// RegisterOnDestroyAction registers a callback to be executed when the task +// is destroyed. +// +// The 'key' parameter provides a way to identify and unregister the action. +// +// Note: These actions are not preserved across S/R. +func (t *Task) RegisterOnDestroyAction(key any, act func(ctx context.Context)) { + t.mu.Lock() + defer t.mu.Unlock() + if t.onDestroyAction == nil { + return + } + t.onDestroyAction[key] = act +} + +// UnregisterOnDestroyAction unregisters a function previously registered with +// RegisterOnDestroyAction using the same key. +func (t *Task) UnregisterOnDestroyAction(key any) { + t.mu.Lock() + defer t.mu.Unlock() + delete(t.onDestroyAction, key) +} + +func (t *Task) execOnDestroyActions() { + t.mu.Lock() + actions := t.onDestroyAction + t.onDestroyAction = nil + t.mu.Unlock() + + if len(actions) == 0 { + return + } + // Run in another goroutine to avoid extra lock dependencies. + go func() { + for _, act := range actions { + act(t) + } + }() +} + // Preconditions: The TaskSet mutex must be locked. 
func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *linux.SignalInfo { info := &linux.SignalInfo{ diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index ded77133f7..8a4924c17e 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -151,30 +151,31 @@ func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, error) parent: cfg.Parent, children: make(map[*Task]struct{}), }, - runState: (*runApp)(nil), - interruptChan: make(chan struct{}, 1), - signalMask: atomicbitops.FromUint64(uint64(cfg.SignalMask)), - signalStack: linux.SignalStack{Flags: linux.SS_DISABLE}, - image: *image, - fsContext: cfg.FSContext, - fdTable: cfg.FDTable, - k: cfg.Kernel, - ptraceTracees: make(map[*Task]struct{}), - allowedCPUMask: cfg.AllowedCPUMask.Copy(), - ioUsage: &usage.IO{}, - niceness: cfg.Niceness, - utsns: cfg.UTSNamespace, - ipcns: cfg.IPCNamespace, - mountNamespace: cfg.MountNamespace, - rseqCPU: -1, - rseqAddr: cfg.RSeqAddr, - rseqSignature: cfg.RSeqSignature, - futexWaiter: futex.NewWaiter(), - containerID: cfg.ContainerID, - cgroups: make(map[Cgroup]struct{}), - userCounters: cfg.UserCounters, - sessionKeyring: cfg.SessionKeyring, - Origin: cfg.Origin, + runState: (*runApp)(nil), + interruptChan: make(chan struct{}, 1), + signalMask: atomicbitops.FromUint64(uint64(cfg.SignalMask)), + signalStack: linux.SignalStack{Flags: linux.SS_DISABLE}, + image: *image, + fsContext: cfg.FSContext, + fdTable: cfg.FDTable, + k: cfg.Kernel, + ptraceTracees: make(map[*Task]struct{}), + allowedCPUMask: cfg.AllowedCPUMask.Copy(), + ioUsage: &usage.IO{}, + niceness: cfg.Niceness, + utsns: cfg.UTSNamespace, + ipcns: cfg.IPCNamespace, + mountNamespace: cfg.MountNamespace, + rseqCPU: -1, + rseqAddr: cfg.RSeqAddr, + rseqSignature: cfg.RSeqSignature, + futexWaiter: futex.NewWaiter(), + containerID: cfg.ContainerID, + cgroups: make(map[Cgroup]struct{}), + userCounters: cfg.UserCounters, + sessionKeyring: 
cfg.SessionKeyring, + Origin: cfg.Origin, + onDestroyAction: make(map[any]func(ctx context.Context)), } t.netns = cfg.NetworkNamespace t.creds.Store(cfg.Credentials)