From 458e0ea576c9fa13e95e3d48c42f0b53ee276d4e Mon Sep 17 00:00:00 2001 From: Tom White Date: Fri, 23 Feb 2024 11:44:56 +0000 Subject: [PATCH 1/2] Add page on optimization to user guide --- docs/images/optimization_map_fusion.svg | 100 +++++++++ docs/images/optimization_multiple_inputs.svg | 164 +++++++++++++++ ...timization_multiple_inputs_unoptimized.svg | 197 ++++++++++++++++++ docs/images/optimization_turned_off.svg | 133 ++++++++++++ docs/user-guide/index.md | 1 + docs/user-guide/optimization.md | 109 ++++++++++ 6 files changed, 704 insertions(+) create mode 100644 docs/images/optimization_map_fusion.svg create mode 100644 docs/images/optimization_multiple_inputs.svg create mode 100644 docs/images/optimization_multiple_inputs_unoptimized.svg create mode 100644 docs/images/optimization_turned_off.svg create mode 100644 docs/user-guide/optimization.md diff --git a/docs/images/optimization_map_fusion.svg b/docs/images/optimization_map_fusion.svg new file mode 100644 index 000000000..fb8a03db4 --- /dev/null +++ b/docs/images/optimization_map_fusion.svg @@ -0,0 +1,100 @@ + + + + + + + +num tasks: 5 +max projected memory: 100.0 MB +total nbytes: 36 bytes +optimized: True + + +op-001 + + +op-001 +asarray + + + + + +array-001 + + +array-001 +a + + + + + +op-001->array-001 + + + + + +op-003 + + +op-003 +astype +tasks: 4 + + + + + +array-001->op-003 + + + + + +array-003 + + +array-003 +c + + + + + +op-003->array-003 + + + + + +create-arrays + + +create-arrays +tasks: 1 + + + + + +arrays + + +arrays + + + + + +create-arrays->arrays + + + + + diff --git a/docs/images/optimization_multiple_inputs.svg b/docs/images/optimization_multiple_inputs.svg new file mode 100644 index 000000000..4566f0aef --- /dev/null +++ b/docs/images/optimization_multiple_inputs.svg @@ -0,0 +1,164 @@ + + + + + + + +num tasks: 5 +max projected memory: 100.0 MB +total nbytes: 72 bytes +optimized: True + + +op-001 + + +op-001 +ones + + + + + +array-001 + + +array-001 +a + + + + + +op-001->array-001 + + + + + +op-005 + + +op-005 +add +tasks: 4 + + + + + +array-001->op-005 + + + + + +op-002 + + +op-002 +ones + + + + + +array-002 + + +array-002 +b + + + + + +op-002->array-002 + + + + + +array-002->op-005 + + + + + +op-003 + + +op-003 +ones + + + + + +array-003 + + +array-003 +c + + + + + +op-003->array-003 + + + + + +array-003->op-005 + + + + + +array-005 + + +array-005 +e + + + + + +op-005->array-005 + + + + + +create-arrays + + +create-arrays +tasks: 1 + + + + + +arrays + + +arrays + + + + + +create-arrays->arrays + + + + + diff --git a/docs/images/optimization_multiple_inputs_unoptimized.svg b/docs/images/optimization_multiple_inputs_unoptimized.svg new file mode 100644 index 000000000..7c40fc18f --- /dev/null +++ b/docs/images/optimization_multiple_inputs_unoptimized.svg @@ -0,0 +1,197 @@ + + + + + + + +num tasks: 10 +max projected memory: 100.0 MB +total nbytes: 144 bytes +optimized: True + + +op-001 + + +op-001 +ones + + + + + +array-001 + + +array-001 +a + + + + + +op-001->array-001 + + + + + +op-005 + + +op-005 +add +tasks: 4 + + + + + +array-001->op-005 + + + + + +op-002 + + +op-002 +ones + + + + + +array-002 + + +array-002 +b + + + + + +op-002->array-002 + + + + + +op-004 + + +op-004 +add +tasks: 4 + + + + + +array-002->op-004 + + + + + +op-003 + + +op-003 +ones + + + + + +array-003 + + +array-003 +c + + + + + +op-003->array-003 + + + + + +array-003->op-004 + + + + + +array-004 + + +array-004 +d + + + + + +op-004->array-004 + + + + + +array-004->op-005 + + + + + +array-005 + + +array-005 +e + + + + + +op-005->array-005 + + + + + +create-arrays + + +create-arrays +tasks: 2 + + + + + +arrays + + +arrays + + + + + +create-arrays->arrays + + + + + diff --git a/docs/images/optimization_turned_off.svg b/docs/images/optimization_turned_off.svg new file mode 100644 index 000000000..3f83dae65 --- /dev/null +++ b/docs/images/optimization_turned_off.svg @@ -0,0 +1,133 @@ + + + + + + + +num tasks: 10 +max projected memory: 100.0 MB +total nbytes: 108 bytes +optimized: False + + +op-001 + + +op-001 +asarray + + + + + +array-001 + + +array-001 +a + + + + + +op-001->array-001 + + + + + +op-002 + + +op-002 +negative +tasks: 4 + + + + + +array-001->op-002 + + + + + +array-002 + + +array-002 +b + + + + + +op-002->array-002 + + + + + +op-003 + + +op-003 +astype +tasks: 4 + + + + + +array-002->op-003 + + + + + +array-003 + + +array-003 +c + + + + + +op-003->array-003 + + + + + +create-arrays + + +create-arrays +tasks: 2 + + + + + +arrays + + +arrays + + + + + +create-arrays->arrays + + + + + diff --git a/docs/user-guide/index.md b/docs/user-guide/index.md index 22fe6a95e..fb0738e63 100644 --- a/docs/user-guide/index.md +++ b/docs/user-guide/index.md @@ -10,6 +10,7 @@ executors storage memory reliability +optimization scaling diagnostics ``` diff --git a/docs/user-guide/optimization.md b/docs/user-guide/optimization.md new file mode 100644 index 000000000..47090c325 --- /dev/null +++ b/docs/user-guide/optimization.md @@ -0,0 +1,109 @@ +# Optimization + +Cubed will automatically optimize the computation graph before running it. This can reduce the number of tasks in the plan, and the amount of intermediate IO, both of which speed up the computation. + +## Map fusion + +The simplest kind of optimization is _map fusion_, where operations that have one preceding operation with the same number of tasks are fused together. This optimization is enabled by default. + +You can see the effect of optimization before running any computation by using the `visualize` method on a Cubed array, such as in the following small example. We start by specifying `optimize_graph=False` to turn off optimization so we can see what the unoptimized plan looks like. + +```python +import cubed.array_api as xp + +a = xp.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]], chunks=(2, 2)) +b = xp.negative(a) +c = xp.astype(b, xp.float32) + +c.visualize("cubed-unoptimized", optimize_graph=False) +``` + +![Computation with optimization turned off](../images/optimization_turned_off.svg) + +Now we call `visualize` again, this time not setting `optimize_graph` so it picks up its default value of `True`. + +```python +c.visualize() +``` + +![Map fusion optimization](../images/optimization_map_fusion.svg) + +Note that with optimization turned on, the array `b` is no longer written as an intermediate output since it will be computed in the same tasks that compute array `c`. The overall number of tasks is reduced from 10 to 5, and the intermediate data (total nbytes) is reduced too. + +## Advanced optimization + +Cubed supports more powerful optimizations, but these are not (currently) enabled by default. This section explains what they are and how you can try them out. + +Map fusion is limited to the cases where there is a single preceding input with the same number of tasks. However, many computations have a branching structure, and the default optimization doesn't help in such cases. Here is an example, where even though optimization is enabled, it doesn't have any effect. + +```python +import cubed.array_api as xp + +a = xp.ones((3, 3), chunks=(2, 2)) +b = xp.ones((3, 3), chunks=(2, 2)) +c = xp.ones((3, 3), chunks=(2, 2)) +d = xp.add(b, c) +e = xp.add(a, d) + +e.visualize() +``` + +![Multiple inputs unoptimized](../images/optimization_multiple_inputs_unoptimized.svg) + +To overcome this limitation we can use the `optimize_function` argument, which allows us to specify the function that performs optimization. Here we use the `multiple_inputs_optimize_dag` function that can handle multiple inputs. + +```python +from cubed.core.optimization import multiple_inputs_optimize_dag + +e.visualize(optimize_function=multiple_inputs_optimize_dag) +``` + +![Multiple inputs unoptimized](../images/optimization_multiple_inputs.svg) + +Notice how the array `d` is fused away. + +The `multiple_inputs_optimize_dag` function has a couple of parameters to control how it works: + +`max_total_source_arrays` specifies the maximum number of source arrays that are allowed in the fused operation, in order to limit the number of reads that an individual task must perform. + +The default is 4. In the previous example above the fused operation has three source arrays (`a`, `b`, and `c`), which is below the maximum default allowed. On the other hand, a computation with a higher "fan-in" that exceeds the maximum will not be fused, or operations will be fused in stages. + +To change this setting, you can use Python's `functools.partial` as follows: + +```python +opt_fn = partial(multiple_inputs_optimize_dag, max_total_source_arrays=8) +visualize(optimize_function=opt_fn) +``` + +`max_total_num_input_blocks` specifies the maximum number of input blocks (chunks) that are allowed in the fused operation. + +Again, this is to limit the number of reads that an individual task must perform. The default is `None`, which means that operations are fused only if they have the same number of tasks. If set to an integer, then this limitation is removed, and tasks with a different number of tasks will be fused - as long as the total number of input blocks does not exceed the maximum. This setting is useful for reductions, and can be set using `functools.partial`: + +```python +opt_fn = partial(multiple_inputs_optimize_dag, max_total_num_input_blocks=10) +visualize(optimize_function=opt_fn) +``` + +## Debugging optimization + +For the advanced optimizations it can be difficult to understand why particular operations in a computation plan have been fused together - or more commonly, why they have *not* been fused. By enabling debug logging you can get detailed information from the optimize function to help you understand which operations are being fused - or not - and the reason in either case. + +Here's the previous example with logging enabled: + +```python +import logging + +logging.basicConfig(level=logging.DEBUG) + +e.visualize(optimize_function=multiple_inputs_optimize_dag) +``` + +The output explains which operations can or can't be fused, and why: + +``` +DEBUG:cubed.core.optimization:can't fuse op-001 since it is not fusable +DEBUG:cubed.core.optimization:can't fuse op-002 since it is not fusable +DEBUG:cubed.core.optimization:can't fuse op-003 since it is not fusable +DEBUG:cubed.core.optimization:can't fuse op-004 since no predecessor ops can be fused +DEBUG:cubed.primitive.blockwise:can fuse op-005 since num tasks of predecessor ops match +``` From e0a594bb362a2318536d2c485672499e635e408b Mon Sep 17 00:00:00 2001 From: Tom White Date: Tue, 1 Oct 2024 12:48:10 +0100 Subject: [PATCH 2/2] Update optimization docs to reflect new default settings, and move advanced settings to the end --- docs/images/optimization_multiple_inputs.svg | 186 +++++++------- ...timization_multiple_inputs_unoptimized.svg | 228 +++++++++--------- docs/user-guide/optimization.md | 85 ++++--- 3 files changed, 255 insertions(+), 244 deletions(-) diff --git a/docs/images/optimization_multiple_inputs.svg b/docs/images/optimization_multiple_inputs.svg index 4566f0aef..b0da3e21f 100644 --- a/docs/images/optimization_multiple_inputs.svg +++ b/docs/images/optimization_multiple_inputs.svg @@ -1,147 +1,147 @@ - - - - -num tasks: 5 -max projected memory: 100.0 MB -total nbytes: 72 bytes -optimized: True - + + + +num tasks: 5 +max projected memory: 100.0 MB +total nbytes written: 72 bytes +optimized: True + -op-001 - - -op-001 -ones +op-004 + + +op-004 +ones - + -array-001 - - -array-001 -a +array-004 + + +array-004 +a - + -op-001->array-001 - - +op-004->array-004 + + - + -op-005 - - -op-005 -add -tasks: 4 +op-008 + + +op-008 +add +tasks: 4 - + -array-001->op-005 - - +array-004->op-008 + + - + -op-002 - - -op-002 -ones +op-005 + + +op-005 +ones - + -array-002 - - -array-002 -b +array-005 + + +array-005 +b - + -op-002->array-002 - - +op-005->array-005 + + - + -array-002->op-005 - - +array-005->op-008 + + - + -op-003 - - -op-003 -ones +op-006 + + +op-006 +ones - + -array-003 - - -array-003 -c +array-006 + + +array-006 +c - + -op-003->array-003 - - +op-006->array-006 + + - + -array-003->op-005 - - +array-006->op-008 + + - + -array-005 - - -array-005 -e +array-008 + + +array-008 +e - + -op-005->array-005 - - +op-008->array-008 + + create-arrays - -create-arrays -tasks: 1 + +create-arrays +tasks: 1 @@ -149,16 +149,16 @@ arrays - -arrays + +arrays create-arrays->arrays - - + + diff --git a/docs/images/optimization_multiple_inputs_unoptimized.svg b/docs/images/optimization_multiple_inputs_unoptimized.svg index 7c40fc18f..7eb2026e3 100644 --- a/docs/images/optimization_multiple_inputs_unoptimized.svg +++ b/docs/images/optimization_multiple_inputs_unoptimized.svg @@ -1,180 +1,180 @@ - - - - -num tasks: 10 -max projected memory: 100.0 MB -total nbytes: 144 bytes -optimized: True - + + + +num tasks: 10 +max projected memory: 100.0 MB +total nbytes written: 144 bytes +optimized: False + -op-001 - - -op-001 -ones +op-004 + + +op-004 +ones - + -array-001 - - -array-001 -a +array-004 + + +array-004 +a - + -op-001->array-001 - - +op-004->array-004 + + - + -op-005 - - -op-005 -add -tasks: 4 +op-008 + + +op-008 +add +tasks: 4 - + -array-001->op-005 - - +array-004->op-008 + + - + -op-002 - - -op-002 -ones +op-005 + + +op-005 +ones - + -array-002 - - -array-002 -b +array-005 + + +array-005 +b - + -op-002->array-002 - - +op-005->array-005 + + - + -op-004 - - -op-004 -add -tasks: 4 +op-007 + + +op-007 +add +tasks: 4 - + -array-002->op-004 - - +array-005->op-007 + + - + -op-003 - - -op-003 -ones +op-006 + + +op-006 +ones - + -array-003 - - -array-003 -c +array-006 + + +array-006 +c - + -op-003->array-003 - - +op-006->array-006 + + - + -array-003->op-004 - - +array-006->op-007 + + - + -array-004 - - -array-004 -d +array-007 + + +array-007 +d - + -op-004->array-004 - - +op-007->array-007 + + - + -array-004->op-005 - - +array-007->op-008 + + - + -array-005 - - -array-005 -e +array-008 + + +array-008 +e - + -op-005->array-005 - - +op-008->array-008 + + create-arrays - -create-arrays -tasks: 2 + +create-arrays +tasks: 2 @@ -182,16 +182,16 @@ arrays - -arrays + +arrays create-arrays->arrays - - + + diff --git a/docs/user-guide/optimization.md b/docs/user-guide/optimization.md index 47090c325..3844d5811 100644 --- a/docs/user-guide/optimization.md +++ b/docs/user-guide/optimization.md @@ -28,13 +28,17 @@ c.visualize() ![Map fusion optimization](../images/optimization_map_fusion.svg) -Note that with optimization turned on, the array `b` is no longer written as an intermediate output since it will be computed in the same tasks that compute array `c`. The overall number of tasks is reduced from 10 to 5, and the intermediate data (total nbytes) is reduced too. +Note that with optimization turned on, the array `b` is no longer written as an intermediate output since it will be computed in the same tasks that compute array `c`. The overall number of tasks is reduced from 10 to 5, and the intermediate data (total `nbytes`) is reduced too. -## Advanced optimization +Here we have just called `visualize` with the `optimize_graph` argument, but it's possible to use it when calling `compute` - which can be useful when debugging a computation. -Cubed supports more powerful optimizations, but these are not (currently) enabled by default. This section explains what they are and how you can try them out. +```python +c.compute(optimize_graph=False) +``` + +## Multiple-input fusion -Map fusion is limited to the cases where there is a single preceding input with the same number of tasks. However, many computations have a branching structure, and the default optimization doesn't help in such cases. Here is an example, where even though optimization is enabled, it doesn't have any effect. +Cubed supports more powerful optimizations, such as for when an array is created from multiple input arrays. Here is an example, shown first with optimization turned off. ```python import cubed.array_api as xp @@ -45,48 +49,24 @@ c = xp.ones((3, 3), chunks=(2, 2)) d = xp.add(b, c) e = xp.add(a, d) -e.visualize() +e.visualize("cubed-unoptimized", optimize_graph=False) ``` ![Multiple inputs unoptimized](../images/optimization_multiple_inputs_unoptimized.svg) -To overcome this limitation we can use the `optimize_function` argument, which allows us to specify the function that performs optimization. Here we use the `multiple_inputs_optimize_dag` function that can handle multiple inputs. +And with optimization turned on (the default): ```python -from cubed.core.optimization import multiple_inputs_optimize_dag - -e.visualize(optimize_function=multiple_inputs_optimize_dag) +e.visualize() ``` -![Multiple inputs unoptimized](../images/optimization_multiple_inputs.svg) +![Multiple inputs optimized](../images/optimization_multiple_inputs.svg) Notice how the array `d` is fused away. -The `multiple_inputs_optimize_dag` function has a couple of parameters to control how it works: - -`max_total_source_arrays` specifies the maximum number of source arrays that are allowed in the fused operation, in order to limit the number of reads that an individual task must perform. - -The default is 4. In the previous example above the fused operation has three source arrays (`a`, `b`, and `c`), which is below the maximum default allowed. On the other hand, a computation with a higher "fan-in" that exceeds the maximum will not be fused, or operations will be fused in stages. - -To change this setting, you can use Python's `functools.partial` as follows: - -```python -opt_fn = partial(multiple_inputs_optimize_dag, max_total_source_arrays=8) -visualize(optimize_function=opt_fn) -``` - -`max_total_num_input_blocks` specifies the maximum number of input blocks (chunks) that are allowed in the fused operation. - -Again, this is to limit the number of reads that an individual task must perform. The default is `None`, which means that operations are fused only if they have the same number of tasks. If set to an integer, then this limitation is removed, and tasks with a different number of tasks will be fused - as long as the total number of input blocks does not exceed the maximum. This setting is useful for reductions, and can be set using `functools.partial`: - -```python -opt_fn = partial(multiple_inputs_optimize_dag, max_total_num_input_blocks=10) -visualize(optimize_function=opt_fn) -``` - ## Debugging optimization -For the advanced optimizations it can be difficult to understand why particular operations in a computation plan have been fused together - or more commonly, why they have *not* been fused. By enabling debug logging you can get detailed information from the optimize function to help you understand which operations are being fused - or not - and the reason in either case. +Sometimes it can be difficult to understand why particular operations in a computation plan have been fused together - or more commonly, why they have *not* been fused. By enabling debug logging you can get detailed information from the optimize function to help you understand which operations are being fused - or not - and the reason in either case. Here's the previous example with logging enabled: @@ -95,15 +75,46 @@ import logging logging.basicConfig(level=logging.DEBUG) -e.visualize(optimize_function=multiple_inputs_optimize_dag) +e.visualize() ``` The output explains which operations can or can't be fused, and why: ``` -DEBUG:cubed.core.optimization:can't fuse op-001 since it is not fusable -DEBUG:cubed.core.optimization:can't fuse op-002 since it is not fusable -DEBUG:cubed.core.optimization:can't fuse op-003 since it is not fusable +DEBUG:cubed.core.optimization:can't fuse op-001 since it is not a primitive operation, or it uses map_direct +DEBUG:cubed.core.optimization:can't fuse op-002 since it is not a primitive operation, or it uses map_direct +DEBUG:cubed.core.optimization:can't fuse op-003 since it is not a primitive operation, or it uses map_direct DEBUG:cubed.core.optimization:can't fuse op-004 since no predecessor ops can be fused DEBUG:cubed.primitive.blockwise:can fuse op-005 since num tasks of predecessor ops match ``` + +## Advanced settings + +There are limits to how many input arrays and input chunks reads are fused together. These are imposed so that the number of reads that an individual task must perform is not excessive, which would otherwise result in slow running tasks. + +In some cases you may want to change these limits, which we look at here. + +### Total number of source arrays + +Cubed will not fuse operations that result in more than 4 source arrays in the fused operation. In the previous example above the fused operation has three source arrays (`a`, `b`, and `c`), which is below the maximum default allowed. On the other hand, a computation with a higher "fan-in" that exceeds the maximum will not be fused, or operations will be fused in stages. + +To change this, we have to specify the `optimize_function` that Cubed should use: `multiple_inputs_optimize_dag`. In addition, we use `fuctools.partial` to set the `max_total_source_arrays` argument to 8 as follows: + +```python +from functools import partial +from cubed.core.optimization import multiple_inputs_optimize_dag + +opt_fn = partial(multiple_inputs_optimize_dag, max_total_source_arrays=8) +e.visualize(optimize_function=opt_fn) +``` + +### Total number of input blocks + +The `max_total_num_input_blocks` argument to `multiple_inputs_optimize_dag` specifies the maximum number of input blocks (chunks) that are allowed in the fused operation. + +Again, this is to limit the number of reads that an individual task must perform. The default is `None`, which means that operations are fused only if they have the same number of tasks. If set to an integer, then this limitation is removed, and tasks with a different number of tasks will be fused - as long as the total number of input blocks does not exceed the maximum. This setting is useful for reductions, and can be set using `functools.partial`: + +```python +opt_fn = partial(multiple_inputs_optimize_dag, max_total_num_input_blocks=10) +e.visualize(optimize_function=opt_fn) +```