Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

POC: deleted files #445

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ directories = "4"
sysinfo = "0.27"
ctrlc = "3.4"
chrono = "0.4"
procfs = { version = "0.17.0", default-features = false }

[target.'cfg(not(target_has_atomic = "64"))'.dependencies]
portable-atomic = "1.4"
Expand Down
156 changes: 154 additions & 2 deletions src/dir_walker.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
use std::cmp::Ordering;
use std::fs;
use std::fs::Metadata;
use std::os::linux::fs::MetadataExt;
use std::path;
use std::sync::Arc;
use std::sync::Mutex;

use crate::node::Node;
use crate::platform::InodeAndDevice;
use crate::progress::Operation;
use crate::progress::PAtomicInfo;
use crate::progress::RuntimeErrors;
use crate::progress::ORDERING;
use crate::utils::is_filtered_out_due_to_file_time;
use crate::utils::is_filtered_out_due_to_invert_regex;
use crate::utils::is_filtered_out_due_to_regex;
use procfs::process::FDTarget;
use rayon::iter::ParallelBridge;
use rayon::prelude::ParallelIterator;
use regex::Regex;
Expand Down Expand Up @@ -48,9 +53,44 @@
pub errors: Arc<Mutex<RuntimeErrors>>,
}

/// Returns the deleted files that are still opened by a process.
///
/// Walks `/proc/$PID/fd/$FD` for every process and keeps the file descriptors
/// whose target is a path and whose metadata reports `st_nlink() == 0`
/// (deleted files have no remaining link).
///
/// Each entry is the path reported by `procfs` for the descriptor together
/// with the metadata read through the `/proc/$PID/fd/$FD` entry.
///
/// This is best effort: processes, descriptors or metadata that cannot be
/// read are skipped, and an unreadable process list yields an empty result
/// instead of a panic.
fn get_deleted_files() -> Vec<(PathBuf, Metadata)> {
    // Listing processes is I/O that can fail; like every other failure in this
    // function it must not abort the whole walk.
    let Ok(processes) = procfs::process::all_processes() else {
        return Vec::new();
    };

    let mut deleted_files = Vec::new();

    for p in processes {
        let Ok(p) = p else {
            continue;
        };
        let Ok(fds) = p.fd() else {
            continue;
        };

        for fd in fds {
            let Ok(fd) = fd else {
                continue;
            };

            // Only descriptors that point to a filesystem path are of interest.
            if let FDTarget::Path(path) = &fd.target {
                // Stat the file through the /proc entry of the descriptor.
                let proc_fd = format!("/proc/{}/fd/{}", p.pid, fd.fd);
                let Ok(metadata) = std::fs::metadata(&proc_fd) else {
                    continue;
                };

                if metadata.st_nlink() == 0 {
                    // TODO: remove " (deleted)", not part of actual name
                    deleted_files.push((path.clone(), metadata));
                }
            }
        }
    }

    deleted_files
}

pub fn walk_it(dirs: HashSet<PathBuf>, walk_data: &WalkData) -> Vec<Node> {
let mut inodes = HashSet::new();
let top_level_nodes: Vec<_> = dirs
let mut top_level_nodes: Vec<_> = dirs
.into_iter()
.filter_map(|d| {
let prog_data = &walk_data.progress_data;
Expand All @@ -62,11 +102,123 @@
clean_inodes(node, &mut inodes, walk_data)
})
.collect();

// TODO: use a flag
let handle_deleted_files = true;

if handle_deleted_files {
let deleted_files: Vec<_> = get_deleted_files()
.into_iter()
.filter(|(_path, metadata)| {
let inode_and_device = (metadata.st_ino(), metadata.st_dev());
// ignore inodes already collected as part of regular files
!inodes.contains(&inode_and_device)
})
.collect();

// we try to insert deleted files in the node tree
for (path, m) in &deleted_files {
for mut top_level_node in &mut top_level_nodes {
// deleted files are always absolute, but not the files in the node tree
let absolute_path = path::absolute(&top_level_node.name).unwrap();
if path.starts_with(&absolute_path) {
insert_deleted_file_in_node_tree(
path.clone(),
m,
&mut top_level_node,

Check failure on line 128 in src/dir_walker.rs

View workflow job for this annotation

GitHub Actions / Style (ubuntu-latest)

this expression creates a reference which is immediately dereferenced by the compiler
&walk_data,

Check failure on line 129 in src/dir_walker.rs

View workflow job for this annotation

GitHub Actions / Style (ubuntu-latest)

this expression creates a reference which is immediately dereferenced by the compiler
0,
);
}
}

// Ignoring deleted file {:?} not child of any top_level_nodes
}
}

top_level_nodes
}

/// Tries to insert the deleted file `path` in `root`, or in one of its children.
///
/// The size of the file is added to `root` and to every node visited on the
/// way down. When no existing child can hold the file, an empty folder node is
/// created for the next path component and the insertion continues inside it.
///
/// * `path` - absolute path of the deleted file; it must be located below `root`.
/// * `m` - metadata of the deleted file (size, inode and device are read from it).
/// * `root` - node to insert into.
/// * `walk_data` - only `use_apparent_size` is read, to choose between the
///   apparent size and the allocated size (`st_blocks() * 512`).
/// * `depth` - depth of `root`; nodes created directly below it get `depth + 1`.
///
/// # Panics
///
/// Panics if a node name cannot be converted to an absolute path, if `path`
/// has no parent, or if `path` is not strictly below `root`.
fn insert_deleted_file_in_node_tree(
    path: PathBuf,
    m: &Metadata,
    root: &mut Node,
    walk_data: &WalkData,
    depth: usize,
) {
    // TODO: filecount, filetime, regex...
    let size = if walk_data.use_apparent_size {
        m.st_size()
    } else {
        m.st_blocks() * 512
    };

    root.size += size;

    // Deleted files are always absolute, but not the names in the node tree:
    // compute the absolute form of the root once and reuse it below.
    let root_absolute = path::absolute(&root.name)
        .expect("node name must be convertible to an absolute path");

    if path
        .parent()
        .expect("path of deleted file return by kernel always has a parent")
        == root_absolute
    {
        // we found the node that represents the parent dir, create the deleted file as a new file
        root.children.push(Node {
            name: path,
            size,
            children: vec![],
            inode_device: Some((m.st_ino(), m.st_dev())),
            // `depth` is the depth of `root`; the file is one level below it,
            // like the folder nodes created further down.
            depth: depth + 1,
        });
        return;
    }

    // try to find the folder where the deleted file was
    for child in &mut root.children {
        let child_absolute = path::absolute(&child.name)
            .expect("node name must be convertible to an absolute path");
        if path.starts_with(child_absolute) {
            insert_deleted_file_in_node_tree(path, m, child, walk_data, depth + 1);
            return;
        }
    }

    // can't find a child to insert the file, we need to create a new folder
    // a bit messy because we need to convert to/from absolute paths
    let dir_name = path
        .strip_prefix(&root_absolute)
        .expect("deleted file must be located below the root node")
        .components()
        .next()
        .expect("deleted file path must be longer than the root path");
    let absolute_dir_name = root_absolute.join(dir_name);

    let new_folder = Node {
        name: absolute_dir_name,
        size: 0,
        children: vec![],
        inode_device: root.inode_device.map(|(_inode, device)| (0, device)), // keep the device, if we want to filter by device
        depth: depth + 1,
    };

    root.children.push(new_folder);

    // Continue inside the folder that was just created; its size is updated by
    // the recursive call.
    insert_deleted_file_in_node_tree(
        path,
        m,
        root.children.last_mut().unwrap(),
        walk_data,
        depth + 1,
    );
}

// Remove files which have the same inode, we don't want to double count them.
fn clean_inodes(x: Node, inodes: &mut HashSet<(u64, u64)>, walk_data: &WalkData) -> Option<Node> {
fn clean_inodes(
x: Node,
inodes: &mut HashSet<InodeAndDevice>,
walk_data: &WalkData,
) -> Option<Node> {
if !walk_data.use_apparent_size {
if let Some(id) = x.inode_device {
if !inodes.insert(id) {
Expand Down
13 changes: 7 additions & 6 deletions src/node.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use crate::dir_walker::WalkData;
use crate::platform::get_metadata;
use crate::platform::InodeAndDevice;
use crate::utils::is_filtered_out_due_to_file_time;
use crate::utils::is_filtered_out_due_to_invert_regex;
use crate::utils::is_filtered_out_due_to_regex;
Expand All @@ -12,7 +13,7 @@ pub struct Node {
pub name: PathBuf,
pub size: u64,
pub children: Vec<Node>,
pub inode_device: Option<(u64, u64)>,
pub inode_device: Option<InodeAndDevice>,
pub depth: usize,
}

Expand All @@ -25,7 +26,7 @@ pub enum FileTime {

#[allow(clippy::too_many_arguments)]
pub fn build_node(
dir: PathBuf,
path: PathBuf,
children: Vec<Node>,
is_symlink: bool,
is_file: bool,
Expand All @@ -37,15 +38,15 @@ pub fn build_node(
let by_filetime = &walk_data.by_filetime;

get_metadata(
&dir,
&path,
use_apparent_size,
walk_data.follow_links && is_symlink,
)
.map(|data| {
let inode_device = data.1;

let size = if is_filtered_out_due_to_regex(walk_data.filter_regex, &dir)
|| is_filtered_out_due_to_invert_regex(walk_data.invert_filter_regex, &dir)
let size = if is_filtered_out_due_to_regex(walk_data.filter_regex, &path)
|| is_filtered_out_due_to_invert_regex(walk_data.invert_filter_regex, &path)
|| by_filecount && !is_file
|| [
(&walk_data.filter_modified_time, data.2 .0),
Expand All @@ -71,7 +72,7 @@ pub fn build_node(
};

Node {
name: dir,
name: path,
size,
children,
inode_device,
Expand Down
2 changes: 1 addition & 1 deletion src/platform.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ fn get_block_size() -> u64 {
512
}

type InodeAndDevice = (u64, u64);
pub(crate) type InodeAndDevice = (u64, u64);
type FileTime = (i64, i64, i64);

#[cfg(target_family = "unix")]
Expand Down
Loading