% directory-traversal.tex

% https://ctan.um.ac.ir/macros/latex/contrib/beamer/doc/beameruserguide.pdf
\documentclass[17pt]{beamer}
\mode<presentation>{\usetheme{default} \setbeamercovered{transparent}}

\usepackage[english]{babel}
%\usepackage{times}
\usepackage{tgadventor}
\usepackage{graphicx}
\usepackage{hyperref}
%\usepackage[T1]{fontenc}
\usepackage[font=tiny]{caption}

% Customize the appearance of links
\hypersetup{
    colorlinks=true, % Enable colored links
    linkcolor=black, % Set the link color to black
    urlcolor=blue,   % Set the URL color (if needed)
%    citecolor=green  % Set the color for citations (if needed)
}

%------------------------------------------------------------------------------
% Title/Author/Organization/Date
%------------------------------------------------------------------------------

%\title[Haskell Streamly]{Directory Traversal: Haskell Outperforms Rust}
\title[Haskell Streamly]{Directory Traversal in Haskell}
\author{Harendra Kumar \texttt{harendra@composewell.com}}
\institute{Composewell Technologies}
\date{\small Functional Conf 2025}

\input{haskell-code}

%\AtBeginSubsection[]
%{
%  \begin{frame}<beamer>{Outline}
%    \tableofcontents[currentsection,currentsubsection]
%  \end{frame}
%}

%------------------------------------------------------------------------------
\begin{document}
%------------------------------------------------------------------------------

\begin{frame}
  \titlepage
\end{frame}

%\begin{frame}{Outline}
%  \tableofcontents [pausesections]
%\end{frame}

%------------------------------------------------------------------------------
\section{Motivation}
%------------------------------------------------------------------------------

\begin{frame}{Outline of the Talk}{Tree Traversals}
\begin{itemize}
    % This talk is also a case study in understanding and improving performance
    % characteristics of a Haskell program.
  \item A case study in performance improvement
  \item Directory traversal in Haskell
  \item Directory traversal in Haskell Streamly
  \item Tree traversal combinators in Streamly
  %\item Breadth first (BFS) vs depth first (DFS) traversals
  \item BFS vs DFS traversals
  \item Many ways to write tree traversals
  %\item Graph traversals
\end{itemize}
\end{frame}

\begin{frame}{Outline of the Talk}{Concurrency and Performance}
\begin{itemize}
  \item Concurrent tree traversal
  \item Comparing the performance % with ls, find and Rust's fd
  \item Why is Haskell faster than Rust?
    % Where else is it faster?
    % Can everything be faster in Haskell?
    % What is the right way to write fast Haskell programs?
    % Streamly is a framework to let you write it in the right way so that it
    % is automatically fast.
  \item Performance lessons and takeaways
\end{itemize}
\end{frame}

\begin{frame}[fragile]{Directory Traversal - Naive.hs}
{}
\tiny
\begin{minipage}{\textwidth}
\begin{code}
listDir :: FilePath -> IO ()
listDir dir = do
 contents <- listDirectory dir -- getdents syscall
 let fullPaths = map (dir </>) contents
 forM fullPaths $ \path -> do
  isDir <- doesDirectoryExist path -- stat syscall
  if isDir
  then do
   symlink <- pathIsSymbolicLink path -- lstat call
   if symlink
   then putStrLn path
   else listDir path -- depth first
  else putStrLn path
 putStrLn dir
\end{code}
\end{minipage}
\end{frame}

\begin{frame}{Benchmarking Setup}{}
\begin{itemize}
  \item Operating system: Linux
  \item Dataset: git repos with build artifacts
  \item total directories: $\approx 10K$
  \item total files: $\approx 50K$
  \item maximum depth: $\approx 20$ levels
  \item maximum breadth: $\approx 500$ files
  \item big directory test is missing
  \item Symbolic links not followed
  %\item BFS vs DFS
  % DFS/BFS compare with find using -depth option
\end{itemize}
\end{frame}

\begin{frame}{Benchmarking Parameters}{}
%\tiny

%Parameters to measure:
\begin{itemize}
  \item Wall clock time
  \item CPU time
  \item OS Memory (peak rss)
  \item Haskell allocations
\end{itemize}
\end{frame}

\begin{frame}{Benchmarking Tools}{}
%\tiny

Measurement tools:
\begin{itemize}
  \item timing: bash built-in `time'
  \item os memory: standalone `time -v'
  \item Haskell allocations: `+RTS -s'
  %\item output coloring is disabled for fd
\end{itemize}
\end{frame}

\begin{frame}[fragile]{Correctness and Fairness Check}{}
%\tiny

GNU find output:
\begin{code}
$ find .| wc
57257   57257 4095021
\end{code}

Naive Haskell program output:
\begin{code}
$ ghc -O2 Naive.hs
$ ./Naive | wc
57257   57257 4095021
\end{code}
\end{frame}

\begin{frame}[fragile]{GNU find vs Naive Haskell}{CPU time with hot caches}

%GNU find vs Naive Haskell - CPU time:
\begin{code}
$ time find . > /dev/null
real    0m0.184s
user    0m0.088s
sys     0m0.096s

$ time ./Naive > /dev/null
real    0m0.454s
user    0m0.291s
sys     0m0.164s

$ ./Naive +RTS -s > /dev/null
     458,801,424 bytes allocated in the heap

\end{code}
%  \item \$ time find -depth . > /dev/null
%  \item \$ time bfs -j1 -depth . > /dev/null
%  %\item \$ time fd -j1 -u --color never . > /dev/null
%  \item \$ time fd -j1 -u . > /dev/null
%\item \$ time ls -fR1A . > /dev/null
\scriptsize
Note: the wall clock time (real) $\approx$ cpu time (user + sys) because it is
a CPU bound test with hot caches.

\end{frame}

\begin{frame}{What is wrong with Naive?}{Performance issues}

Simple and idiomatic code but:
\begin{itemize}
  \item `listDirectory' buffers entire dir
  \item won't work with big directories
  \item `listDirectory' uses [Char] as paths
  \item Additional `stat' call for isDir check
  \item Additional `lstat' call for symlink check
  \item One `putStrLn' call for each path
\end{itemize}
\end{frame}

\begin{frame}[fragile]{Naive.hs with OsPath}
{Replaces lists with arrays}
\tiny
\begin{minipage}{\textwidth}
\begin{code}
listDir :: OsPath -> IO ()
listDir dir = do
 contents <- listDirectory dir
 let fullPaths = map (dir </>) contents
 forM fullPaths $ \path -> do
     isDir <- doesDirectoryExist path
     if isDir
     then do
         symlink <- pathIsSymbolicLink path
         if symlink
         then putStrLn $ fromJust $ decodeUtf path
         else listDir path -- depth first
     else putStrLn $ fromJust $ decodeUtf path
 putStrLn $ fromJust $ decodeUtf dir
\end{code}
\end{minipage}
\end{frame}

\begin{frame}[fragile]{Naive.hs with OsPath Performance}
{Naive = 454ms, NaiveOsPath = 382ms}

\begin{code}
$ time ./NaiveOsPath > /dev/null
real    0m0.382s
user    0m0.176s
sys     0m0.207s

$ ./NaiveOsPath +RTS -s > /dev/null
     441,745,528 bytes allocated in the heap
\end{code}

\scriptsize
\begin{itemize}
\item 16\% (72/454) reduction in time over original
\item 4\% (17/459) reduction in allocations
\end{itemize}
\end{frame}

%loopDir :: PosixString -> DirStream -> IO ()
    %resetErrno

        %errno <- getErrno
        %if (errno == eINTR)
        %then loopDir dirname dirp
        %else do
        %    let (Errno n) = errno
        %    if (n == 0)
        %    then closeDirStream dirp
        %    else throwErrno "loopDir"
\begin{frame}[fragile]{Naive.hs with DirStream}
{}
\begin{smallcode}
loopDir dirname dirp = do
    ptr <- c_readdir dirp -- no buffering
    if (ptr /= nullPtr)
    then do
        let dname = #{ptr struct dirent,d_name} ptr
        dtype <- #{peek struct dirent, d_type} ptr
        name <- peekFilePath (castPtr dname)
        let fn = dirname </> name
        if dtype == (#const DT_DIR)
        then do
            isMeta <- isMetaDir dname
            when (not isMeta) $ do
                putStrLn $ fromJust $ decodeUtf fn
                openDirStream fn >>= loopDir fn
        else putStrLn $ fromJust $ decodeUtf fn
        loopDir dirname dirp
    else do
      ...
\end{smallcode}
\end{frame}

\begin{frame}[fragile]{Naive with DirStream}
{NaiveOsPath = 382ms, NaiveDirStream = 250ms}

\begin{code}
$ time ./NaiveDirStream > /dev/null
real    0m0.250s
user    0m0.162s
sys     0m0.087s

$ ./NaiveDirStream +RTS -s > /dev/null
     386,655,384 bytes allocated in the heap
\end{code}

\scriptsize
\begin{itemize}
\item 35\% (132/382) reduction in time over NaiveOsPath
\item 13\% (56/442) reduction in allocations
\end{itemize}

\end{frame}

\begin{frame}{Issues with NaiveDirStream}{}

We fixed all perf issues we mentioned except one. Now we have:
\begin{itemize}
  \item One `putStrLn' call for each path
  \item Also, path to string for printing
  \item Code is low level vs NaiveOsPath
  \item Much better than C though
  \item Still slower than `find'
\end{itemize}

Can we be elegant as well as performant?
\end{frame}

\begin{frame}[fragile]{Reading a Directory as a Stream}
{Modular building block}

Reads a directory as an Either stream, Left is Dir and Right is File:
\begin{code}
readEitherPaths
  :: Path
  -> Stream m (Either Path Path)
\end{code}

But how to perform the recursion present in the previous code, elegantly?

\end{frame}

\begin{frame}[fragile]{Tree Traversal: Functional Style }{concatIterate}

Like `concatMap' but iterates over the output stream:
\begin{code}
concatIterate
  :: (a -> Stream m a) -> Stream m a -> Stream m a
\end{code}

\begin{itemize}
  \item `iterate' lifted to streaming
  \item a function is mapped over input stream
  \item mapped again over the output stream
  \item until the output stream is nil
\end{itemize}
\end{frame}

\begin{frame}[fragile]{Traversing Directory Tree}{Applying concatIterate}

For illustration, the type would look like:
\begin{code}
concatIterate
 :: (Either Dir File -> Stream m (Either Dir File))
 -> Stream m (Either Dir File)
 -> Stream m (Either Dir File)
\end{code}

\begin{itemize}
  \item Directories are Left, files are Right
  \item Generates Either child stream on Lefts
  \item Generates nil stream on Rights
  \item stream output is fed back to iterate
  %\item Iterates a function over all dirs
\end{itemize}
\end{frame}

%\begin{frame}[fragile]{DFS and BFS Traversal}{concatIterate variants}
%
%\begin{itemize}
%  \item concatIterateDfs: depth first
%  \item concatIterateBfs: breadth first
%\end{itemize}
%
%\end{frame}

\begin{frame}[fragile]{Depth First Style}{concatIterateWith}

Like `concatMapWith', uses a function to combine the generated output
streams in a depth first manner, forming a list-like structure.
\begin{code}
concatIterateWith
 :: (StreamK m a -> StreamK m a -> StreamK m a)
 -> (a -> StreamK m a)
 -> StreamK m a
 -> StreamK m a

concatIterate = concatIterateWith StreamK.append -- DFS
\end{code}
\end{frame}

\begin{frame}[fragile]{Breadth First Style}{mergeIterateWith}

Like `mergeMapWith', uses a function to combine the output streams
pairwise in a breadth-first manner, forming a tree-like structure.
\begin{code}
mergeIterateWith
 :: (StreamK m a -> StreamK m a -> StreamK m a)
 -> (a -> StreamK m a)
 -> StreamK m a
 -> StreamK m a

mergeIterate = mergeIterateWith StreamK.interleave -- BFS
\end{code}
\end{frame}

\begin{frame}[fragile]{Parallel Traversal}{parConcatIterate}

Parallelizes iteration over multiple elements of the stream:
\begin{code}
parConcatIterate :: MonadAsync m
 => (Config -> Config)
 -> (a -> Stream m a)
 -> Stream m a
 -> Stream m a
\end{code}

parMergeIterate is not implemented yet.

\end{frame}

\begin{frame}[fragile]{Parallel Traversal Variants}{parConcatIterate}

The first argument of `parConcatIterate' is a concurrency config modifier,
used to specify the concurrency style, maxThreads, maxBuffer, etc.
\begin{code}
Stream.parConcatIterate (Stream.interleaved True)
Stream.parConcatIterate (Stream.ordered True)
\end{code}

\end{frame}

\begin{frame}[fragile]{Graph Traversal}{concatIterateScan}

Use scan state to remember already traversed nodes.
\begin{code}
concatIterateScan :: Monad m 
  => (b -> a -> m b) 
  -> (b -> m (Maybe (b, Stream m a))) 
  -> b 
  -> Stream m a
\end{code}

\end{frame}

\begin{frame}[fragile]{Directory Iteration Function}
{}

Adapt `readEitherPaths' to work on an Either input: for a dir (Left)
generate a dir stream, for a file (Right) generate a nil stream.
\begin{code}
f :: Either Path b -> Stream IO (Either Path Path)
f = either Dir.readEitherPaths (const Stream.nil)
\end{code}
\end{frame}

\begin{frame}[fragile]{Batched Output with Streamly}
{Composed Pipeline - serial}
  %$ toK StreamK.concatIterateWith StreamK.append f
\begin{code}
listDir :: IO ()
listDir = do
 Stream.fold (Handle.writeWith 32000 stdout)
  $ Stream.unfoldEachEndBy 10 Array.reader
  $ fmap (Path.toChunk . either id id)
  $ concatIterate f
  $ Stream.fromPure dir

 where

 f :: Either Path b -> Stream IO (Either Path Path)
 f = either Dir.readEitherPaths (const Stream.nil)

 dir = Left (fromJust $ Path.fromString ".")
\end{code}
\end{frame}

\begin{frame}[fragile]{Where are we?}
{NaiveDirStream = 250ms, BatchedStreamly = 184ms }

\begin{code}
$ time ./BatchedStreamly > /dev/null
real    0m0.184s
user    0m0.079s
sys     0m0.105s

$ ./ListDir +RTS -s > /dev/null
     161,668,280 bytes allocated in the heap
\end{code}

\scriptsize
\begin{itemize}
\item 26\% (66/250) reduction in time over NaiveDirStream
\item 58\% (225/386) reduction in allocations
\item performance is same as GNU find!
\item code is perfectly modular!
\item and we are not done yet!
\end{itemize}
\end{frame}

% listDir :: IO ()
%listDir = do
% Stream.fold (Handle.writeWith 32000 stdout)
%  $ Stream.unfoldEachEndBy 10 Array.reader
%  $ fmap Path.toChunk
%  $ Stream.unfoldEach Unfold.fromList
%  $ fmap (either id id)
%  $ toK StreamK.concatIterateWith StreamK.append f
%  $ Stream.fromPure dir
%
 %where

\begin{frame}[fragile]{Chunked Stream}
{readEitherChunks}

%Single path to single path stream:
%\begin{code}
%f :: Either Path b -> Stream IO (Either Path Path)
%f = either Dir.readEitherPaths (const Stream.nil)
%\end{code}

Path list input to path list output stream:
\begin{code}
f :: Either [Path] b
  -> Stream IO (Either [Path] [Path])
f = either Dir.readEitherChunks (const Stream.nil)

\end{code}
\end{frame}

\begin{frame}[fragile]{Chunked Stream Performance}
{BatchedStreamly = 184ms, ChunkedStreamly = 144ms}

\begin{code}
$ time ./ChunkedStreamly > /dev/null
real    0m0.144s
user    0m0.050s
sys     0m0.094s

$ ./ListDir +RTS -s > /dev/null
      32,387,736 bytes allocated in the heap
\end{code}

\scriptsize
\begin{itemize}
\item 22\% (40/184) reduction in time over BatchedStreamly
\item 80\% (129/161) reduction in allocations
\item better than the 184ms taken by GNU find!
\item code is perfectly modular!
\item and we are not done yet!
\end{itemize}
\end{frame}

\begin{frame}[fragile]{Byte Array Output}
{readEitherByteChunks}

Path list input and byte-array stream output:
\begin{code}
f :: Either [Path] b
  -> Stream IO (Either [Path] (Array Word8))
f = either
      Dir.readEitherByteChunks
      (const Stream.nil)
\end{code}
\end{frame}

\begin{frame}[fragile]{Byte Array Output Performance}
{ChunkedStreamly = 144ms, BufOutStreamly = 134ms}

\begin{code}
$ time ./BufOutStreamly > /dev/null
real    0m0.134s
user    0m0.041s
sys     0m0.093s

$ ./ListDir +RTS -s > /dev/null
      20,811,840 bytes allocated in the heap
\end{code}

\scriptsize
\begin{itemize}
\item 7\% (10/144) reduction in time over ChunkedStreamly
\item 37\% (12/32) reduction in allocations
\item fastest serial version yet
\item can be made better!
\end{itemize}
\end{frame}

\begin{frame}[fragile]{Concurrent Traversal}
{Composed Pipeline - concurrent}

%Serial stream iteration:
%\begin{code}
%StreamK.concatIterateWith StreamK.append f
%\end{code}

In the pipeline, just replace `concatIterate' with `parConcatIterate'.
%\begin{code}
%Stream.parConcatIterate id f
%\end{code}
\end{frame}

\begin{frame}[fragile]{Parallel Traversal Performance}{CPU time}

\begin{code}
$ time ./ParStreamly > /dev/null
real    0m0.085s
user    0m0.060s
sys     0m0.086s

$ ./ListDir +RTS -s > /dev/null
      18,861,616 bytes allocated in the heap
\end{code}

\begin{itemize}
\item 171\% CPU utilization (146/85)
\item that's all, we are done now
\end{itemize}
\end{frame}

\begin{frame}{Exercise}{Write Your Own}
\begin{itemize}
  \item Write this in your favorite lang
  \item Compare the performance
  \item Hint: Use GPT to do it quickly
\end{itemize}
\end{frame}

\begin{frame}{Comparative Performance}{Tools Compared}
\scriptsize

%\begin{itemize}
%  % https://unix.stackexchange.com/questions/279895/how-can-i-do-a-breadth-first-search-using-find
%  \item GNU find default order is neither dfs nor bfs
%  \item GNU find has -depth (DFS) option
%  \item GNU ls order is also neither dfs nor bfs
%  \item Rust fd traverses in dfs order
%  \item The `bfs' tool traverses in bfs order
%  % https://github.com/tavianator/bfs
%\end{itemize}

\begin{tabular}{|l|l|l|l|l|}
\hline
  & GNU find & bfs & fd -u & ListDir \\
\hline
  %Version & GNU ls 9.3 & GNU find 4.9.0 & fd 8.7.1 & bfs 3.0.4 \\
  Version & GNU find 4.9.0 & bfs 3.0.4 & fd 8.7.1 & streamly-0.11 \\
\hline
  Language & C & C & Rust & Haskell \\
\hline
  Search Order & Mixed,DFS & BFS,DFS & DFS & BFS,DFS \\
\hline
% Options used
\end{tabular}
\end{frame}

\begin{frame}[fragile]{Comparative Performance}{Functionality Check for Tools}

Ensure we are traversing the same no.\ of entries:
\begin{code}
  $ find .| wc
  57257   57257 4095021
  $ bfs .| wc
  57257   57257 4095021
  $ fd -u .| wc
  57256   57256 3990961
  $ ListDir | wc
  57256   57256 4095019
\end{code}
  %\item \$ ls -fR1A|wc \\
  %78165   67711 1964893
\end{frame}

\begin{frame}[fragile]{Comparative Performance}{Cases to benchmark}

\begin{itemize}
  \item case 1: IO Bound (with cold cache), clear the caches first:
\begin{itemize}
%\tiny
  %\item \$ cat /proc/sys/fs/dentry-state
  %\item \$ cat /proc/meminfo
  %\item \$ sync; echo 1 > /proc/sys/vm/drop_caches  % page cache
  %\item \$ sync; echo 2 > /proc/sys/vm/drop_caches % dentries, inodes
\item
\begin{code}
$ sync
$ echo 3 > /proc/sys/vm/drop_caches
\end{code}
  \item Run ls in the parent dir to fill required parent caches
\end{itemize}
  \item case 2: CPU Bound (with hot cache), run the command once before
  measuring, to warm the cache
  \item Serial vs Concurrent
  %\item DFS vs BFS
\end{itemize}

\end{frame}

\begin{frame}{CPU Bound - Serial}{Hot Cache}
\scriptsize

%\item CPU time and real time are very close indicates that there is no
%IO involved.

%& GNU ls & GNU find & Rust fd & bfs & ListDir \\
%Real time & 150 & 187 & 175 & 108 & 134 \\
%CPU time & 149 & 186 & 304 & 108 & 133 \\
%RSS & 3404 K & 2872 K & 26204 K & 3564 K & 14556 K\\
%\begin{tabular}{|l|l|l|l|l|}
%\hline
%  & GNU find & bfs & Rust fd & ListDir \\
%\hline
%  Real time & 187 & 112 & 175 & 134 \\
%\hline
%  CPU time & 186 & 111 & 304 & 133 \\
%\hline
%  CPU util & 99.5\% & 101.8\% & \color{red}{173.7\%} & 99.3\% \\
%\hline
%  Memory & 2.8 MiB & 3.5 MiB & 25.5 MiB & 14.2 MiB\\
%\hline
%\end{tabular}

\begin{tabular}{|l|l|l|l|l|}
\hline
  & bfs (C) & ListDir (Haskell) & GNU find (C) & fd (Rust) \\
\hline
  Real time & 112 ms & 134 ms & 187 ms & 175 ms \\
\hline
  CPU time & 111 ms & 133 ms & 186 ms & 304 ms \\
\hline
  CPU util & 101.8\% & 99.3\% & 99.5\% & \textcolor{red}{173.7\%} \\
\hline
  Memory & 3.5 MiB & 14.2 MiB & 2.8 MiB & 25.5 MiB\\
\hline
\end{tabular}

Note: fd -j1 is not actually serial!

Note: the memory difference between Haskell and C is because of a constant
overhead due to Haskell RTS.

\end{frame}

%\begin{frame}{Benchmark Results}{Serial + BFS + CPU bound}
%\tiny
%
%\begin{tabular}{|l|l|l|}
%\hline
%  & bfs & ListDir \\
%\hline
%  Real time & 108 & 134 \\
%\hline
%  CPU time & 108 & 134 \\
%\hline
%  CPU util & 100\% & 100\% \\
%\hline
%  Memory & 3.5 MiB & 12.5 MiB\\
%\hline
%\end{tabular}
%\end{frame}

%\begin{frame}{Benchmark Results}{Concurrent + BFS + CPU bound}
%\tiny
%
%\begin{tabular}{|l|l|l|}
%\hline
%  & bfs & ListDir \\
%\hline
%  Real time & 126 & 134 \\
%\hline
%  CPU time & 216 & 134 \\
%\hline
%  CPU util & 171\% & 100\% \\
%\hline
%  Memory & 56.7 MiB & 12.5 MiB\\
%\hline
%\end{tabular}
%\end{frame}

\begin{frame}{CPU Bound - Concurrent}{Hot Cache}
\footnotesize

\begin{tabular}{|l|l|l|l|}
\hline
  & ListDir (Haskell) & bfs (C) & fd (Rust) \\
\hline
  Real time & 84 ms & 125 ms & 140 ms \\
\hline
  CPU time & 149 ms & 222 ms & 262 ms \\
\hline
  CPU util & 177\% & 177\% & 187\% \\
\hline
  Memory & 22.5 MiB & 56.7 MiB & 40.4 MiB \\
\hline
\end{tabular}
\end{frame}

\begin{frame}{IO Bound - Serial}{Cold Cache}
\footnotesize

\begin{tabular}{|l|l|l|l|l|}
\hline
  & find (C)
  & ListDir (Haskell)
  & bfs (C)
  & fd (Rust)
  \\
\hline
  Real time
  & 6984 ms
  & 6988 ms
  & 7616 ms
  & 7062 ms
  \\
\hline
  CPU time
  & 917 ms
  & 1033 ms
  & 887 ms
  & 1601 ms
  \\
\hline
  CPU util
  & 13.1\%
  & 14.8\%
  & 11.6\%
  & 22.7\%
  \\
\hline
  Memory
  & 2.6 MiB
  & 14.0 MiB
  & 3.5 MiB
  & 27.5 MiB
  \\
\hline
\end{tabular}

Note: `fd -j1' is likely not single-threaded, as we saw in the CPU bound case.

\end{frame}

\begin{frame}{IO Bound - Concurrent}{Cold Cache}
\footnotesize

\begin{tabular}{|l|l|l|l|}
\hline
  & ListDir (Haskell)
  & fd (Rust)
  & bfs (C)
  \\
\hline
  Real time
  & 5982 ms
  & 5987 ms
  & 6858 ms
  \\
\hline
  CPU time
  & 895 ms
  & 1370 ms
  & 1203 ms
  \\
\hline
  CPU util
  & 15\%
  & 22.9\%
  & 17.5\%
  \\
\hline
  Memory
  & 19.3 MiB
  & 39.6 MiB
  & 56.5 MiB
  \\
\hline
\end{tabular}

\end{frame}

\begin{frame}{Exercise}{Write fd using Streamly}
\begin{itemize}
  % call it fnd
  \item with all compatible options
  \item with both DFS and BFS modes
  \item will need a regex library
\end{itemize}
\end{frame}

\begin{frame}{Performance Lessons}{}
\begin{itemize}
\item Allocations are not very expensive
\item 25x allocs $\approx$ 2.5x cpu time
\item But they indicate other perf issues
\item Remove allocations in the fast path
\item Remove intermediate structures
\item Loop fusion is good at it
\item Reduce big loop iterations
\item Chunking is good at it
\item Code review from perf perspective
%  \item Carefully review your code and see which allocations are removable.
%  \item I do most of the optimizations by review only, not by measuring or
%    debugging.
%  \item Loop fusion is good at it
%  \item when we are dealing in big chunks fusion may not matter, the loop runs
%    fewer times and does a lot more work. So what matters is that one big stage
%    and not the fusion of that stage with other stages. Trying to fuse it via a
%    fusible state machine will make matters worse.
%  \item rule of thumb - for smaller loops fusion is important, for bigger loops
%    fusion makes it worse.
%  \item in a larger loop make the Skip as function calls
%  \item Copying data is almost nothing compared to the time it takes to run the
%    loop so many times with many allocations. Copying is almost free at times,
%    caring about it is useless unless we are talking about a large amount of
%    data or cache effects of small copying.
\end{itemize}
\end{frame}

%\begin{frame}{Modularity Pays}{Focus on important parts}
%\begin{itemize}
%  \item Focus is on the structure, not details
%  %\item Look at the outline of our example
%  %\item Focus is on iteration and compaction
%  \item Modularity from bottom to top
%  \item Program: composed building blocks
%\end{itemize}
%\end{frame}

\begin{frame}{Modularity Pays}{Bottom to Top Modularity}

Modularity turns the focus on the structure, not details.
Building blocks:
\begin{itemize}
  \item are well tested
  \item are well benchmarked
  \item compose well
  %\item Composing them does not change perf
  \item avoid ad-hoc, monolithic code
  %\item This ultimately pays off in keeping performance good
\end{itemize}
\end{frame}

\begin{frame}{Streamly Building Blocks}{Concurrency and Performance}

Streamly provides building blocks that are:
\begin{itemize}
  %\item Fearless concurrency
  %\item Implicit concurrency %, almost free % in Haskell Streamly
  \item Modular
  \item Composable
  \item Concurrent
  \item High-performance
  %\item Streamly lets you write high performance code in Haskell Streamly
\end{itemize}
\end{frame}

\begin{frame}{Haskell for Systems Programming}{}
\begin{itemize}
  \item As good as other systems languages
  \item Outperformed C and Rust above
  \item We need to use it correctly
  \item Perf needs careful review
  \item Modularity, refactoring - plus points
  \item Allows concise and high level code
  %\item Haskell can equal or outperform Rust % in real world applications
%  \item Can Haskell do as well as C or Rust or other systems programming
%    languages? The answer is an EMPHATIC yes. We have done this time and again
%    and with good modularity using idiomatic code. We have sort of perfected
%    this now using some rules and programming patterns.
%    There is a lot of framework level work still to be done to enable this in a
%    wide range of use cases, but we will get there. We can make Haskell great
%    again.
%  \item Haskell RealWorld\# $\geq$ Rust
%  \item Concise: quickly build high level code
%  \item Modular: fearless refactoring
\end{itemize}
\end{frame}

\begin{frame}{Source Code Links}
\begin{itemize}
\item
\href{https://github.com/composewell/fnconf-2025-directory-traversal}
  {this presentation}: slides and basic Haskell example sources.
\item
  \href{https://github.com/composewell/streamly-examples/blob/aec56f4e4481e97330aa5ce4324cd1a2d2b3c2bd/examples/ListDir.hs}
    {streamly-examples repo}: for ListDir example source code.
\item \href{https://github.com/composewell/streamly}{streamly repo}:
  the perf improvements will be available in streamly-0.11.
  \end{itemize}
\end{frame}

\begin{frame}{Acknowledgements}
  Thanks to Domen Kožar for sending a pull request to streamly that spurred
  more performance improvements.
\end{frame}

\begin{frame}{}
  \begin{center}
    Thank You!

    harendra@composewell.com
  \end{center}
\end{frame}

%------------------------------------------------------------------------------
\end{document}
%------------------------------------------------------------------------------