From fac58aa217f7b4c4d5327b201f5e9fe8cbf9ee6a Mon Sep 17 00:00:00 2001
From: "Siang-Yun (Sonia) Lee" <siang-yun.lee@epfl.ch>
Date: Fri, 30 Jun 2023 15:36:19 +0200
Subject: [PATCH] Realistic AQFP technology constraints (#614)

* missing includes

* realistic assumptions & updated verification

* modifications from unmerged PR #594

* update basic ASAP ALAP scheduling

* update counting & dump

* update chunk movement

* compile & debug

* debug chunk movement

* clean up depth-optimal scheduling

* visualization

* move PIs and POs, debug

* remove buffer chains; clean up

* cleanup

* fix merging mistakes

* fix retiming

* fix tests

* delete deprecated verification method

* fix experiments
---
 experiments/aqfp_flow_aspdac.cpp              |   22 +-
 experiments/aqfp_flow_date.cpp                |    5 +-
 experiments/buffer_insertion.cpp              |   93 +-
 experiments/buffer_insertion_iwls.cpp         |  101 --
 experiments/buffer_insertion_iwls.json        |  160 --
 .../algorithms/aqfp/aqfp_assumptions.hpp      |   86 +-
 .../algorithms/aqfp/aqfp_rebuild.hpp          |   29 +-
 .../algorithms/aqfp/aqfp_retiming.hpp         |    2 +-
 .../algorithms/aqfp/buffer_insertion.hpp      | 1550 +++++++++--------
 .../algorithms/aqfp/buffer_verification.hpp   |  192 +-
 .../aqfp/optimal_buffer_insertion.hpp         |   85 +
 include/mockturtle/io/write_dot.hpp           |   22 +
 include/mockturtle/mockturtle.hpp             |    2 +
 include/mockturtle/networks/buffered.hpp      |   90 +-
 test/algorithms/aqfp/aqfp_retiming.cpp        |    7 +-
 test/algorithms/aqfp/buffer_insertion.cpp     |  328 +---
 16 files changed, 1326 insertions(+), 1448 deletions(-)
 delete mode 100644 experiments/buffer_insertion_iwls.cpp
 delete mode 100644 experiments/buffer_insertion_iwls.json

diff --git a/experiments/aqfp_flow_aspdac.cpp b/experiments/aqfp_flow_aspdac.cpp
index cd713694e..9385c219b 100644
--- a/experiments/aqfp_flow_aspdac.cpp
+++ b/experiments/aqfp_flow_aspdac.cpp
@@ -91,15 +91,18 @@ int main()
     /* convert MIG network to AQFP */
     aqfp_network aqfp = cleanup_dangling<mig_network, aqfp_network>( mig_opt );
 
+    aqfp_assumptions_legacy aqfp_ps;
+    aqfp_ps.splitter_capacity = 4;
+    aqfp_ps.branch_pis = true;
+    aqfp_ps.balance_pis = true;
+    aqfp_ps.balance_pos = true;
+
     /* Buffer insertion params */
     buffer_insertion_params buf_ps;
     buf_ps.scheduling = buffer_insertion_params::better_depth;
     buf_ps.optimization_effort = buffer_insertion_params::none;
     buf_ps.max_chunk_size = 100;
-    buf_ps.assume.splitter_capacity = 4u;
-    buf_ps.assume.branch_pis = true;
-    buf_ps.assume.balance_pis = true;
-    buf_ps.assume.balance_pos = true;
+    buf_ps.assume = legacy_to_realistic( aqfp_ps );
 
     /* buffer insertion */
     stopwatch<>::duration time_insertion{ 0 };
@@ -110,12 +113,6 @@ int main()
     uint32_t jj_depth = buf_inst.depth();
     total_runtime += to_seconds( time_insertion );
 
-    aqfp_assumptions aqfp_ps;
-    aqfp_ps.splitter_capacity = buf_ps.assume.splitter_capacity;
-    aqfp_ps.branch_pis = buf_ps.assume.branch_pis;
-    aqfp_ps.balance_pis = buf_ps.assume.balance_pis;
-    aqfp_ps.balance_pos = buf_ps.assume.balance_pos;
-
     /* retiming params */
     aqfp_retiming_params aps;
     aps.aqfp_assumptions_ps = aqfp_ps;
@@ -169,7 +166,10 @@ int main()
 
     /* cec */
     auto cec = abc_cec( buffered_aqfp, benchmark );
-    cec &= verify_aqfp_buffer( buffered_aqfp, aqfp_ps );
+    std::vector<uint32_t> pi_levels;
+    for ( auto i = 0u; i < buffered_aqfp.num_pis(); ++i )
+      pi_levels.emplace_back( 0 );
+    cec &= verify_aqfp_buffer( buffered_aqfp, aqfp_ps, pi_levels );
 
     /* compute final JJ cost */
     uint32_t num_jjs_ret = 0;
diff --git a/experiments/aqfp_flow_date.cpp b/experiments/aqfp_flow_date.cpp
index 1a8267ef1..a2f03c634 100644
--- a/experiments/aqfp_flow_date.cpp
+++ b/experiments/aqfp_flow_date.cpp
@@ -401,9 +401,8 @@ int main( int argc, char** argv )
     buf_ps.optimization_effort = buffer_insertion_params::until_sat;
     buf_ps.max_chunk_size = std::numeric_limits<uint32_t>::max();
     buf_ps.assume.splitter_capacity = 4u;
-    buf_ps.assume.branch_pis = false;
-    buf_ps.assume.balance_pis = false;
-    buf_ps.assume.balance_pos = true;
+    buf_ps.assume.ci_capacity = std::numeric_limits<uint32_t>::max();
+    buf_ps.assume.balance_cios = true;
     buffer_insertion buf_inst( aqfp, buf_ps );
     uint32_t num_bufs = buf_inst.dry_run();
     uint32_t num_jjs = opt_stats.maj3_after_exact * 6 + opt_stats.maj5_after_exact * 10 + num_bufs * 2;
diff --git a/experiments/buffer_insertion.cpp b/experiments/buffer_insertion.cpp
index 86545ef40..c83bb24d2 100644
--- a/experiments/buffer_insertion.cpp
+++ b/experiments/buffer_insertion.cpp
@@ -23,18 +23,13 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 #include "experiments.hpp"
-#include <lorina/aiger.hpp>
 #include <lorina/diagnostics.hpp>
 #include <lorina/verilog.hpp>
 #include <mockturtle/algorithms/aqfp/buffer_insertion.hpp>
 #include <mockturtle/algorithms/aqfp/buffer_verification.hpp>
-#include <mockturtle/algorithms/cleanup.hpp>
-#include <mockturtle/algorithms/mapper.hpp>
-#include <mockturtle/algorithms/node_resynthesis/mig_npn.hpp>
-#include <mockturtle/io/aiger_reader.hpp>
 #include <mockturtle/io/verilog_reader.hpp>
 #include <mockturtle/io/write_verilog.hpp>
-#include <mockturtle/networks/aig.hpp>
+#include <mockturtle/io/write_dot.hpp>
 #include <mockturtle/networks/buffered.hpp>
 #include <mockturtle/networks/mig.hpp>
 #include <mockturtle/utils/name_utils.hpp>
@@ -44,6 +39,8 @@
 
 #include <iostream>
 
+using namespace mockturtle;
+
 int main( int argc, char* argv[] )
 {
   std::string run_only_one = "";
@@ -59,9 +56,9 @@ int main( int argc, char* argv[] )
   /* NOTE 2: Please clone this repository: https://github.com/lsils/SCE-benchmarks
    * And put in the following string the relative path from your build path to SCE-benchmarks/ISCAS/strashed/
    */
-  std::string benchmark_path = "../../SCE-benchmarks/ISCAS/strashed/";
+  // std::string benchmark_path = "../../SCE-benchmarks/ISCAS/strashed/";
   // std::string benchmark_path = "../../SCE-benchmarks/MCNC/original/";
-  // std::string benchmark_path = "../../SCE-benchmarks/EPFL/MIGs/";
+   std::string benchmark_path = "../../SCE-benchmarks/EPFL/MIGs/";
   static const std::string benchmarks_iscas[] = {
       "adder1", "adder8", "mult8", "counter16", "counter32", "counter64", "counter128",
       "c17", "c432", "c499", "c880", "c1355", "c1908", "c2670", "c3540", "c5315", "c6288", "c7552",
@@ -71,33 +68,30 @@ int main( int argc, char* argv[] )
       "m3", "max512", "misex3", "mlp4", "prom2", "sqr6", "x1dn" };
   const auto benchmarks_epfl = experiments::epfl_benchmarks();
 
-  experiment<std::string, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, float, bool>
-      exp( "buffer_insertion", "benchmark", "#gates", "depth", "max FO", "#buffers", "opt. #JJs", "depth_JJ", "runtime", "verified" );
+  experiment<std::string, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, float, bool>
+      exp( "buffer_insertion", "benchmark", "#gates", "#buffers", "#buff real", "max phase skip", "depth_JJ", "runtime", "verified" );
 
   buffer_insertion_params ps;
-  ps.scheduling = buffer_insertion_params::better;
-  ps.optimization_effort = buffer_insertion_params::until_sat;
-  ps.assume.splitter_capacity = 4u;
-  ps.assume.branch_pis = true;
-  ps.assume.balance_pis = true;
-  ps.assume.balance_pos = true;
-
-  if ( argc == 3 ) // example syntax: ./buffer_insertion 4 111
-  {
-    ps.assume.splitter_capacity = std::stoi( argv[1] );
-    uint32_t arg = std::stoi( argv[2] );
-    ps.assume.branch_pis = arg >= 100;
-    ps.assume.balance_pis = ( arg % 100 ) >= 10;
-    ps.assume.balance_pos = arg % 10;
-  }
+  ps.scheduling = buffer_insertion_params::better_depth;
+  ps.optimization_effort = buffer_insertion_params::none;
+  ps.max_chunk_size = 10000;
+
+  // ASP-DAC etc. SoTA works
+  //ps.assume.num_phases = 1;
+  //ps.assume.ci_phases = {0u};
+  //ps.assume.ci_capacity = 1;
+  //ps.assume.splitter_capacity = 4;
+  //ps.assume.balance_cios = true;
+  
+  // best possible relaxation
+  ps.assume.ci_capacity = 2;
+  ps.assume.ci_phases = { 3u, 4u, 5u };
 
   uint32_t total_buffers{ 0 }, total_depth{ 0 };
-  for ( auto benchmark : benchmarks_iscas )
+  for ( auto benchmark : benchmarks_epfl )
   {
     if ( run_only_one != "" && benchmark != run_only_one )
       continue;
-    if ( benchmark == "hyp" && run_only_one != "hyp" )
-      continue;
     std::cout << "\n[i] processing " << benchmark << "\n";
 
     names_view<mig_network> ntk;
@@ -114,21 +108,28 @@ int main( int argc, char* argv[] )
     stopwatch<>::duration t{ 0 };
     buffer_insertion aqfp( ntk, ps );
     buffered_mig_network bufntk;
+    std::vector<uint32_t> pi_levels( ntk.num_pis() );
     uint32_t num_buffers = call_with_stopwatch( t, [&]() {
-      return aqfp.dry_run();
+      return aqfp.run( bufntk, pi_levels );
     } );
-    aqfp.dump_buffered_network( bufntk );
-    bool verified = verify_aqfp_buffer( bufntk, ps.assume );
+    bool verified = verify_aqfp_buffer( bufntk, ps.assume, pi_levels );
+    auto const levels = schedule_buffered_network_with_PI_levels( bufntk, pi_levels );
+
+    uint32_t max_chain = aqfp.remove_buffer_chains( bufntk );
 
     // names_view named_bufntk{bufntk};
     // restore_pio_names_by_order( ntk, named_bufntk );
     // write_verilog( named_bufntk, benchmark_path + "../best_insertion/" + benchmark + "_buffered.v" );
 
-    depth_view d{ ntk };
-    depth_view d_buf{ bufntk };
+#if 0
+    depth_view<buffered_mig_network> depth_buffered( bufntk );
+    depth_buffered.foreach_node( [&]( auto n ){ depth_buffered.set_level( n, levels[n] ); } );
+    write_dot( depth_buffered, benchmark + ".dot" );
+    std::system( fmt::format( "dot -Tpng -o {0}.png {0}.dot; rm {0}.dot; open {0}.png", benchmark ).c_str() );
+#endif
 
     total_buffers += num_buffers;
-    total_depth += d_buf.depth();
+    total_depth += aqfp.depth();
 
     uint32_t max_fanout{ 0 };
     ntk.foreach_node( [&]( auto const& n ) {
@@ -136,7 +137,29 @@ int main( int argc, char* argv[] )
         max_fanout = std::max( max_fanout, ntk.fanout_size( n ) );
     } );
 
-    exp( benchmark, ntk.num_gates(), d.depth(), max_fanout, num_buffers, ntk.num_gates() * 6 + num_buffers * 2, d_buf.depth(), to_seconds( t ), verified );
+    uint32_t num_buffers_real{0}, max_phase_skip{0};
+
+    bufntk.foreach_node( [&]( auto n ){
+      if ( bufntk.is_buf( n ) && !bufntk.is_dead( n ) )
+        num_buffers_real++;
+    });
+    max_phase_skip = max_chain;
+    for ( auto pil : pi_levels )
+    {
+      if ( pil % 4 == 1 )
+        max_phase_skip = std::max( max_phase_skip,  pil - 5 );
+      else if ( pil % 4 == 0 )
+        max_phase_skip = std::max( max_phase_skip,  pil - 4 );
+      else if ( pil % 4 == 3 )
+        max_phase_skip = std::max( max_phase_skip,  pil - 3 );
+      else
+        fmt::print( "strange pi level {}\n", pil );
+    }
+    bufntk.foreach_po( [&]( auto f ){
+      max_phase_skip = std::max( max_phase_skip, aqfp.depth() - levels[f] );
+    });
+
+    exp( benchmark, ntk.num_gates(), num_buffers, num_buffers_real, max_phase_skip, aqfp.depth(), to_seconds( t ), verified );
   }
 
   exp.save();
diff --git a/experiments/buffer_insertion_iwls.cpp b/experiments/buffer_insertion_iwls.cpp
deleted file mode 100644
index 56bc21117..000000000
--- a/experiments/buffer_insertion_iwls.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/* mockturtle: C++ logic network library
- * Copyright (C) 2018-2022  EPFL
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "experiments.hpp"
-#include <mockturtle/algorithms/aqfp/buffer_insertion.hpp>
-#include <mockturtle/algorithms/aqfp/buffer_verification.hpp>
-#include <mockturtle/io/verilog_reader.hpp>
-#include <mockturtle/networks/buffered.hpp>
-#include <mockturtle/networks/mig.hpp>
-#include <mockturtle/views/depth_view.hpp>
-
-#include <algorithm>
-#include <fmt/format.h>
-#include <lorina/lorina.hpp>
-#include <string>
-
-/* Note: Please download this repository: https://github.com/lsils/ASPDAC2021_exp
-   and copy the folder ASPDAC2021_exp/experiments/benchmarks_aqfp/ to the build path of mockturtle. */
-int main()
-{
-  using namespace experiments;
-  using namespace mockturtle;
-
-  experiment<std::string, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t>
-      exp( "buffer_insertion_iwls", "benchmark", "#gates", "depth", "ASAP", "ALAP", "opt", "depth_JJ" );
-
-  static const std::string benchmarks_aqfp[] = {
-      /*"5xp1",*/ "c1908", "c432", "c5315", "c880", "chkn", "count", "dist", "in5", "in6", "k2",
-      "m3", "max512", "misex3", "mlp4", "prom2", "sqr6", "x1dn" };
-
-  for ( auto const& benchmark : benchmarks_aqfp )
-  {
-    uint32_t b_ASAP, b_ALAP, b_OPT;
-    fmt::print( "[i] processing {}\n", benchmark );
-    mig_network mig;
-    if ( lorina::read_verilog( "benchmarks_aqfp/" + benchmark + ".v", verilog_reader( mig ) ) != lorina::return_code::success )
-      return -1;
-
-    buffer_insertion_params ps;
-    ps.optimization_effort = buffer_insertion_params::until_sat;
-    ps.assume.splitter_capacity = 3u;
-    ps.assume.branch_pis = true;
-    ps.assume.balance_pis = false;
-    ps.assume.balance_pos = false;
-
-    buffer_insertion aqfp( mig, ps );
-
-    aqfp.ASAP();
-    aqfp.count_buffers();
-    b_ASAP = aqfp.num_buffers();
-
-    aqfp.ALAP();
-    aqfp.count_buffers();
-    b_ALAP = aqfp.num_buffers();
-
-    if ( b_ALAP > b_ASAP )
-    {
-      aqfp.ASAP(); // UNDO ALAP
-      aqfp.count_buffers();
-    }
-
-    aqfp.optimize();
-    aqfp.count_buffers();
-    b_OPT = aqfp.num_buffers();
-
-    buffered_mig_network bufntk;
-    aqfp.dump_buffered_network( bufntk );
-    depth_view d_buf{ bufntk };
-    assert( verify_aqfp_buffer( bufntk, ps.assume ) );
-
-    depth_view d{ mig };
-    exp( benchmark, mig.num_gates(), d.depth(), b_ASAP, b_ALAP, b_OPT, d_buf.depth() );
-  }
-
-  exp.save();
-  exp.table();
-
-  return 0;
-}
\ No newline at end of file
diff --git a/experiments/buffer_insertion_iwls.json b/experiments/buffer_insertion_iwls.json
deleted file mode 100644
index c6d367ed0..000000000
--- a/experiments/buffer_insertion_iwls.json
+++ /dev/null
@@ -1,160 +0,0 @@
-[
-  {
-    "entries": [
-      {
-        "#gates": 381,
-        "ALAP": 2910,
-        "ASAP": 2605,
-        "benchmark": "c1908",
-        "depth": 38,
-        "depth_JJ": 62,
-        "opt": 2202
-      },
-      {
-        "#gates": 174,
-        "ALAP": 1891,
-        "ASAP": 2423,
-        "benchmark": "c432",
-        "depth": 44,
-        "depth_JJ": 65,
-        "opt": 1673
-      },
-      {
-        "#gates": 1270,
-        "ALAP": 4197,
-        "ASAP": 6409,
-        "benchmark": "c5315",
-        "depth": 33,
-        "depth_JJ": 56,
-        "opt": 3574
-      },
-      {
-        "#gates": 300,
-        "ALAP": 1448,
-        "ASAP": 1854,
-        "benchmark": "c880",
-        "depth": 28,
-        "depth_JJ": 40,
-        "opt": 1238
-      },
-      {
-        "#gates": 421,
-        "ALAP": 785,
-        "ASAP": 1536,
-        "benchmark": "chkn",
-        "depth": 28,
-        "depth_JJ": 34,
-        "opt": 715
-      },
-      {
-        "#gates": 119,
-        "ALAP": 343,
-        "ASAP": 639,
-        "benchmark": "count",
-        "depth": 18,
-        "depth_JJ": 24,
-        "opt": 286
-      },
-      {
-        "#gates": 535,
-        "ALAP": 791,
-        "ASAP": 1066,
-        "benchmark": "dist",
-        "depth": 16,
-        "depth_JJ": 24,
-        "opt": 761
-      },
-      {
-        "#gates": 443,
-        "ALAP": 814,
-        "ASAP": 1278,
-        "benchmark": "in5",
-        "depth": 19,
-        "depth_JJ": 27,
-        "opt": 746
-      },
-      {
-        "#gates": 370,
-        "ALAP": 674,
-        "ASAP": 1002,
-        "benchmark": "in6",
-        "depth": 17,
-        "depth_JJ": 23,
-        "opt": 621
-      },
-      {
-        "#gates": 1955,
-        "ALAP": 3812,
-        "ASAP": 4512,
-        "benchmark": "k2",
-        "depth": 25,
-        "depth_JJ": 37,
-        "opt": 3249
-      },
-      {
-        "#gates": 411,
-        "ALAP": 611,
-        "ASAP": 761,
-        "benchmark": "m3",
-        "depth": 13,
-        "depth_JJ": 19,
-        "opt": 567
-      },
-      {
-        "#gates": 713,
-        "ALAP": 1081,
-        "ASAP": 1361,
-        "benchmark": "max512",
-        "depth": 17,
-        "depth_JJ": 26,
-        "opt": 1028
-      },
-      {
-        "#gates": 1532,
-        "ALAP": 2983,
-        "ASAP": 4113,
-        "benchmark": "misex3",
-        "depth": 24,
-        "depth_JJ": 34,
-        "opt": 2811
-      },
-      {
-        "#gates": 462,
-        "ALAP": 645,
-        "ASAP": 839,
-        "benchmark": "mlp4",
-        "depth": 16,
-        "depth_JJ": 23,
-        "opt": 603
-      },
-      {
-        "#gates": 3477,
-        "ALAP": 5435,
-        "ASAP": 6777,
-        "benchmark": "prom2",
-        "depth": 22,
-        "depth_JJ": 33,
-        "opt": 5259
-      },
-      {
-        "#gates": 138,
-        "ALAP": 225,
-        "ASAP": 287,
-        "benchmark": "sqr6",
-        "depth": 13,
-        "depth_JJ": 17,
-        "opt": 200
-      },
-      {
-        "#gates": 152,
-        "ALAP": 399,
-        "ASAP": 453,
-        "benchmark": "x1dn",
-        "depth": 14,
-        "depth_JJ": 19,
-        "opt": 362
-      }
-    ],
-    "version": "5becf81"
-  }
-]
diff --git a/include/mockturtle/algorithms/aqfp/aqfp_assumptions.hpp b/include/mockturtle/algorithms/aqfp/aqfp_assumptions.hpp
index fe29ebe46..b8f22e3d7 100644
--- a/include/mockturtle/algorithms/aqfp/aqfp_assumptions.hpp
+++ b/include/mockturtle/algorithms/aqfp/aqfp_assumptions.hpp
@@ -35,24 +35,104 @@
 namespace mockturtle
 {
 
+/*! \brief More realistic AQFP technology assumptions (phase alignment, CI capacity, phase skip). */
+struct aqfp_assumptions_realistic
+{
+  /*! \brief Whether CIs and COs need to be path-balanced. */
+  bool balance_cios{ false };
+
+  /*! \brief Whether CO complementations are ignored (they can be merged into register inputs). */
+  bool ignore_co_negation{ true };
+
+  /*! \brief Number of phases per clock cycle (for phase alignment).
+   *
+   * Each CO (a node with external reference) must be scheduled at a level that is a multiple of
+   * `num_phases` (i.e., an imaginary CO node should be placed at a level `num_phases * k + 1`).
+   */
+  uint32_t num_phases{ 4u };
+
+  /*! \brief The maximum number of fanouts a splitter/buffer can have. */
+  uint32_t splitter_capacity{ 3u };
+
+  /* Disabled option (plain comment on purpose): maximum number of fanouts of a mega splitter. */
+  //uint32_t mega_splitter_capacity{ 7u };
+
+  /*! \brief The maximum number of fanouts a CI can have. */
+  uint32_t ci_capacity{ 1u }; // simple default
+  //uint32_t ci_capacity{ 2u }; // alternative: best possible relaxation
+
+  /*! \brief The phase offsets (after a change in register input) when new register output is available.
+   *
+   * Assuming that the register inputs (D and E) are scheduled at phase 0 (i.e., the last phase
+   * of the previous clock cycle), a new state can be taken this many phases afterwards (one
+   * offset per element).
+   *
+   * An ascending order is assumed. At least one element should be given.
+   *
+   * Each CI must be scheduled at a level `num_phases * k + ci_phases[i]` (for any `i`; for any
+   * integer `k >= 0` when `balance_cios = false`, or `k=0` otherwise).
+   */
+  std::vector<uint32_t> ci_phases{ { 4u } }; // simple default
+  //std::vector<uint32_t> ci_phases{ { 3u, 4u, 5u } }; // alternative: best possible relaxation
+
+  /*! \brief Maximum phase-skip (in consideration of clock skew). */
+  uint32_t max_phase_skip{ 4u };
+};
+
 /*! \brief AQFP technology assumptions.
  *
  * POs count toward the fanout sizes and always have to be branched.
  * If PIs need to be balanced, then they must also need to be branched.
  */
-struct aqfp_assumptions
+struct aqfp_assumptions_legacy
 {
   /*! \brief Whether PIs need to be branched with splitters. */
-  bool branch_pis{ false };
+  bool branch_pis{ true };
 
   /*! \brief Whether PIs need to be path-balanced. */
   bool balance_pis{ false };
 
   /*! \brief Whether POs need to be path-balanced. */
-  bool balance_pos{ true };
+  bool balance_pos{ false };
 
   /*! \brief The maximum number of fanouts each splitter (buffer) can have. */
   uint32_t splitter_capacity{ 3u };
 };
 
+using aqfp_assumptions = aqfp_assumptions_legacy;
+
+/*! \brief Temporary helper function to bridge old and new code.
+ *
+ * `branch_pis = false` maps to an unlimited `ci_capacity` (otherwise 1). `balance_cios`
+ * is set when both `balance_pis` and `balance_pos` are set and cleared when neither is;
+ * a mixed setting cannot be expressed, so an error is printed to `std::cerr` and
+ * `balance_cios` keeps its default. Phase alignment and phase-skip limits are disabled.
+ *
+ * \param legacy Legacy assumptions to convert
+ * \return The corresponding realistic assumptions
+ */
+inline aqfp_assumptions_realistic legacy_to_realistic( aqfp_assumptions_legacy const& legacy )
+{
+  aqfp_assumptions_realistic realistic;
+
+  /* PIs that need no branching may drive any number of fanouts */
+  realistic.ci_capacity = legacy.branch_pis ? 1u : std::numeric_limits<uint32_t>::max();
+
+  /* only "balance both" and "balance neither" have a counterpart */
+  if ( legacy.balance_pis == legacy.balance_pos )
+  {
+    realistic.balance_cios = legacy.balance_pis;
+  }
+  else
+  {
+    std::cerr << "[e] Cannot convert this combination of assumptions.\n";
+  }
+
+  realistic.splitter_capacity = legacy.splitter_capacity;
+  realistic.num_phases = 1u; // no phase alignment
+  realistic.ci_phases = { 0u }; // PIs at level 0
+  realistic.max_phase_skip = std::numeric_limits<uint32_t>::max(); // no clock skew issue
+  return realistic;
+}
+
 } // namespace mockturtle
diff --git a/include/mockturtle/algorithms/aqfp/aqfp_rebuild.hpp b/include/mockturtle/algorithms/aqfp/aqfp_rebuild.hpp
index 657ca2e99..880d7c22b 100644
--- a/include/mockturtle/algorithms/aqfp/aqfp_rebuild.hpp
+++ b/include/mockturtle/algorithms/aqfp/aqfp_rebuild.hpp
@@ -109,29 +109,24 @@ class aqfp_reconstruct_impl
 
     /* compute the node level on the new network */
     node_map<uint32_t, aqfp_network> levels( clean_ntk );
+    _ntk.foreach_gate( [&]( auto const& n ) {
+      levels[old2new[n]] = ntk_level.level( n );
+    } );
 
-    if ( _ps.buffer_insertion_ps.assume.branch_pis )
+    uint32_t max_po_level = 0;
+    clean_ntk.foreach_po( [&]( auto const& f ){
+      uint32_t spl = std::ceil( std::log( clean_ntk.fanout_size( clean_ntk.get_node( f ) ) ) / std::log( _ps.buffer_insertion_ps.assume.splitter_capacity ) );
+      max_po_level = std::max( max_po_level, levels[f] + spl );
+    });
+    std::vector<uint32_t> po_levels;
+    for ( auto i = 0u; i < _ntk.num_pos(); ++i )
     {
-      /* gates are in a fixed position */
-      _ntk.foreach_gate( [&]( auto const& n ) {
-        levels[old2new[n]] = ntk_level.level( n );
-      } );
-    }
-    else
-    {
-      /* gates are not in a fixed position */
-      /* gates are scheduled ALAP */
-
-      /* if not balance POs, POs are scheduled ASAP */
-      auto const levels_guess = schedule_buffered_network( _ntk, _ps.buffer_insertion_ps.assume );
-      _ntk.foreach_gate( [&]( auto const& n ) {
-        levels[old2new[n]] = levels_guess[n];
-      } );
+      po_levels.emplace_back( max_po_level + 1 );
     }
 
     /* recompute splitter trees and return the new buffered network */
     buffered_aqfp_network res;
-    buffer_insertion buf_inst( clean_ntk, levels, _ps.buffer_insertion_ps );
+    buffer_insertion buf_inst( clean_ntk, levels, po_levels, _ps.buffer_insertion_ps );
     _st.num_buffers = buf_inst.run( res );
     return res;
   }
diff --git a/include/mockturtle/algorithms/aqfp/aqfp_retiming.hpp b/include/mockturtle/algorithms/aqfp/aqfp_retiming.hpp
index 651af47f9..b7ff4ef94 100644
--- a/include/mockturtle/algorithms/aqfp/aqfp_retiming.hpp
+++ b/include/mockturtle/algorithms/aqfp/aqfp_retiming.hpp
@@ -140,7 +140,7 @@ class aqfp_retiming_impl
       rps.iterations = 1;
 
     buffer_insertion_params buf_ps;
-    buf_ps.assume = _ps.aqfp_assumptions_ps;
+    buf_ps.assume = legacy_to_realistic( _ps.aqfp_assumptions_ps );
     buf_ps.scheduling = buffer_insertion_params::provided;
     buf_ps.optimization_effort = buffer_insertion_params::none;
     aqfp_reconstruct_params reconstruct_ps;
diff --git a/include/mockturtle/algorithms/aqfp/buffer_insertion.hpp b/include/mockturtle/algorithms/aqfp/buffer_insertion.hpp
index 793a3f87e..52a2a92ad 100644
--- a/include/mockturtle/algorithms/aqfp/buffer_insertion.hpp
+++ b/include/mockturtle/algorithms/aqfp/buffer_insertion.hpp
@@ -57,7 +57,7 @@ namespace mockturtle
 struct buffer_insertion_params
 {
   /*! \brief Technology assumptions. */
-  aqfp_assumptions assume;
+  aqfp_assumptions_realistic assume;
 
   /*! \brief The scheduling strategy to get the initial depth assignment.
    * - `provided` = An initial level assignment is given in the constructor, thus
@@ -139,10 +139,6 @@ struct buffer_insertion_params
       mig_network mig = ...
 
       buffer_insertion_params ps;
-      ps.assume.branch_pis = true;
-      ps.assume.balance_pis = false;
-      ps.assume.balance_pos = true;
-      ps.assume.splitter_capacity = 3u;
       ps.scheduling = buffer_insertion_params::ALAP;
       ps.optimization_effort = buffer_insertion_params::one_pass;
 
@@ -178,7 +174,7 @@ class buffer_insertion
   using signal = typename Ntk::signal;
 
   explicit buffer_insertion( Ntk const& ntk, buffer_insertion_params const& ps = {} )
-      : _ntk( ntk ), _ps( ps ), _levels( _ntk ), _timeframes( _ntk ), _fanouts( _ntk ), _external_ref_count( _ntk ), _external_ref_count_neg( _ntk ), _num_buffers( _ntk ), _min_level( _ntk ), _max_level( _ntk )
+      : _ntk( ntk ), _ps( ps ), _levels( _ntk ), _po_levels( _ntk.num_pos(), 0u ), _timeframes( _ntk ), _fanouts( _ntk ), _num_buffers( _ntk )
   {
     static_assert( !is_buffered_network_type_v<Ntk>, "Ntk is already buffered" );
     static_assert( has_foreach_node_v<Ntk>, "Ntk does not implement the foreach_node method" );
@@ -194,13 +190,15 @@ class buffer_insertion
     static_assert( has_set_visited_v<Ntk>, "Ntk does not implement the set_visited method" );
     static_assert( has_set_value_v<Ntk>, "Ntk does not implement the set_value method" );
 
-    assert( !( _ps.assume.balance_pis && !_ps.assume.branch_pis ) && "Does not make sense to balance but not branch PIs" );
     assert( _ps.scheduling != buffer_insertion_params::provided );
-    initialize_external_ref_counts();
+
+    // checks for assumptions
+    assert( _ps.assume.ci_phases.size() > 0 );
+    assert( _ps.assume.ignore_co_negation ); // consideration of CO negation is too complicated and neglected for now
   }
 
-  explicit buffer_insertion( Ntk const& ntk, node_map<uint32_t, Ntk> const& levels, buffer_insertion_params const& ps = {} )
-      : _ntk( ntk ), _ps( ps ), _levels( levels ), _timeframes( _ntk ), _fanouts( _ntk ), _external_ref_count( _ntk ), _external_ref_count_neg( _ntk ), _num_buffers( _ntk ), _min_level( _ntk ), _max_level( _ntk )
+  explicit buffer_insertion( Ntk const& ntk, node_map<uint32_t, Ntk> const& levels, std::vector<uint32_t> const& po_levels, buffer_insertion_params const& ps = {} )
+      : _ntk( ntk ), _ps( ps ), _levels( levels ), _po_levels( po_levels ), _timeframes( _ntk ), _fanouts( _ntk ), _num_buffers( _ntk )
   {
     static_assert( !is_buffered_network_type_v<Ntk>, "Ntk is already buffered" );
     static_assert( has_foreach_node_v<Ntk>, "Ntk does not implement the foreach_node method" );
@@ -216,23 +214,41 @@ class buffer_insertion
     static_assert( has_set_visited_v<Ntk>, "Ntk does not implement the set_visited method" );
     static_assert( has_set_value_v<Ntk>, "Ntk does not implement the set_value method" );
 
-    assert( !( _ps.assume.balance_pis && !_ps.assume.branch_pis ) && "Does not make sense to balance but not branch PIs" );
     assert( _ps.scheduling == buffer_insertion_params::provided );
-    initialize_external_ref_counts();
+    assert( _po_levels.size() == _ntk.num_pos() );
+  }
+
+  /*! \brief Insert buffers and obtain a buffered network.
+   *
+   * \param bufntk An empty network of an appropriate buffered network type
+   * to store the buffer-insertion result
+   * \return The number of buffers in the resulting network
+   */
+  template<class BufNtk>
+  uint32_t run( BufNtk& bufntk )
+  {
+    dry_run(); // schedule, optimize and count buffers
+    dump_buffered_network( bufntk );
+    return num_buffers();
+  }
 
   /*! \brief Insert buffers and obtain a buffered network.
+   *
+   * It is suggested to write the `pi_levels` information into a dumped file
+   * for easier recovery of the scheduled phase assignment.
+   *
    * \param bufntk An empty network of an appropriate buffered network type to
    * to store the buffer-insertion result
-   * \param pLevels A pointer to a node map which will store the resulting
-   * level assignment
+   * \param pi_lvls A vector which will store the PI level assignment (it is
+   * recommended to store this information together with the buffered network)
    * \return The number of buffers in the resulting network
    */
   template<class BufNtk>
-  uint32_t run( BufNtk& bufntk, node_map<uint32_t, Ntk>* pLevels = nullptr )
+  uint32_t run( BufNtk& bufntk, std::vector<uint32_t>& pi_lvls )
   {
-    dry_run( pLevels );
+    dry_run();
     dump_buffered_network( bufntk );
+    pi_lvls = pi_levels();
     return num_buffers();
   }
 
@@ -242,23 +258,26 @@ class buffer_insertion
    * allows users to experiment on the algorithms with new network types whose
    * corresponding buffered_network are not implemented yet.
    *
-   * \param pLevels A pointer to a node map which will store the resulting
-   * level assignment
+   * `pLevels` and `pPOLevels` can be used to create another `buffer_insertion` instance of
+   * the same state (current schedule), which also define a unique buffered network. (Set
+   * `ps.scheduling = provided` and `ps.optimization_effort = none`)
+   *
    * \return The number of buffers in the resulting network
    */
-  uint32_t dry_run( node_map<uint32_t, Ntk>* pLevels = nullptr )
+  uint32_t dry_run()
   {
     schedule();
     optimize();
     count_buffers();
-
-    if ( pLevels )
-      *pLevels = _levels;
-
     return num_buffers();
   }
 
 #pragma region Query
+  /*! \brief Level assignment of all nodes (the map read by `level`).
+   * Returned by reference; no copy is made.
+   */
+  node_map<uint32_t, Ntk> const& levels() const { return _levels; }
+
   /*! \brief Level of node `n` considering buffer/splitter insertion. */
   uint32_t level( node const& n ) const
   {
@@ -266,13 +285,34 @@ class buffer_insertion
     return _levels[n];
   }
 
+  /*! \brief Levels of all POs (the vector read by `po_level`).
+   * Returned by reference; no copy is made.
+   */
+  std::vector<uint32_t> const& po_levels() const { return _po_levels; }
+
+  /*! \brief Level of the `idx`-th PO (imaginary dummy PO node, not counted in depth). */
+  uint32_t po_level( uint32_t idx ) const
+  {
+    assert( idx < _ntk.num_pos() );
+    return _po_levels[idx];
+  }
+
+  /*! \brief Levels of all PIs, in the order visited by `foreach_pi`. */
+  std::vector<uint32_t> pi_levels() const
+  {
+    std::vector<uint32_t> result;
+    auto const collect = [&]( auto pi ) { result.emplace_back( _levels[pi] ); };
+    _ntk.foreach_pi( collect );
+    return result;
+  }
+
   /*! \brief Network depth considering AQFP buffers/splitters.
    *
-   * Note that when neither PIs nor POs are balanced, there can be
-   * different schedulings for the same buffered network (i.e. having
-   * the same number of buffers), thus this number may be different
-   * from the depth obtained by dumping the buffered network and wrapping
-   * depth_view around it.
+   * Should be equal to `max( po_level(i) - 1 )`.
+   *
+   * This is the number of phases from the previous-stage register to the
+   * next-stage register, including the depth of the previous-stage register
+   * (i.e., from one register input to the next register input).
    */
   uint32_t depth() const
   {
@@ -286,15 +326,9 @@ class buffer_insertion
     assert( !_outdated && "Please call `count_buffers()` first." );
 
     uint32_t count = 0u;
-    if ( _ps.assume.branch_pis )
-    {
-      _ntk.foreach_pi( [&]( auto const& n ) {
+    _ntk.foreach_node( [&]( auto const& n ) {
+      if ( !_ntk.is_constant( n ) )
         count += num_buffers( n );
-      } );
-    }
-
-    _ntk.foreach_gate( [&]( auto const& n ) {
-      count += num_buffers( n );
     } );
     return count;
   }
@@ -307,8 +341,8 @@ class buffer_insertion
     return _num_buffers[n];
   }
 
-  /*! \brief The choosen schedule is ASAP */
-  uint32_t is_scheduled_ASAP() const
+  /*! \brief The chosen schedule is ASAP */
+  bool is_scheduled_ASAP() const
   {
     return _is_scheduled_ASAP;
   }
@@ -328,21 +362,14 @@ class buffer_insertion
       update_fanout_info();
     }
 
-    if ( _ps.assume.branch_pis )
-    {
-      _ntk.foreach_pi( [&]( auto const& n ) {
-        assert( !_ps.assume.balance_pis || _levels[n] == 0 );
+    _ntk.foreach_node( [&]( auto const& n ) {
+      if ( !_ntk.is_constant( n ) )
         _num_buffers[n] = count_buffers( n );
-      } );
-    }
-
-    _ntk.foreach_gate( [&]( auto const& n ) {
-      _num_buffers[n] = count_buffers( n );
     } );
   }
 
 private:
-  uint32_t count_buffers( node const& n )
+  uint32_t count_buffers( node const& n ) const
   {
     assert( !_outdated && "Please call `update_fanout_info()` first." );
     auto const& fo_infos = _fanouts[n];
@@ -356,78 +383,29 @@ class buffer_insertion
 
     if ( _ntk.fanout_size( n ) == 1u ) /* single fanout */
     {
-      if ( _external_ref_count[n] == 1u && !_ps.assume.balance_pos )
-        return 0u;
-      else
-        return fo_infos.front().relative_depth - 1u;
-    }
-
-    /* special case: don't balance POs; multiple PO refs but no gate fanout */
-    if ( !_ps.assume.balance_pos && _ntk.fanout_size( n ) == _external_ref_count[n] )
-    {
-      /* have both polarities */
-      if ( _external_ref_count_neg[n] > 0 && _external_ref_count[n] > _external_ref_count_neg[n] )
-        return std::ceil( float( _external_ref_count[n] - _external_ref_count_neg[n] - 1 ) / float( _ps.assume.splitter_capacity - 1 ) ) + std::ceil( float( _external_ref_count_neg[n] - 1 ) / float( _ps.assume.splitter_capacity - 1 ) ) + 1;
-      else
-        return std::ceil( float( _external_ref_count[n] - 1 ) / float( _ps.assume.splitter_capacity - 1 ) );
+      assert( fo_infos.size() == 1u );
+      return fo_infos.front().relative_depth - 1u;
     }
 
-    assert( fo_infos.size() > 1u );
-    uint32_t count{ 0u };
-
-    /* special case: don't balance POs; have both gate fanout(s) and PO ref(s) */
-    if ( !_ps.assume.balance_pos && _external_ref_count[n] > 0u )
+    if ( _ps.assume.ci_capacity > 1 && _ntk.is_pi( n ) )
     {
-      /* hacky (rare?) case */
-      /*fmt::print( "[w] hacky case: node {} has {} fanouts, including {} gates, {} positive PO refs and {} negative PO refs.\n",
-                  n, _ntk.fanout_size( n ), _ntk.fanout_size( n ) - _external_ref_count[n],
-                  _external_ref_count[n] - _external_ref_count_neg[n], _external_ref_count_neg[n] );*/
-
-      /* count ignoring POs */
-      auto rit = fo_infos.rbegin();
-      assert( rit->fanouts.size() == 0 );
-      while ( rit->fanouts.size() == 0 )
-        ++rit;
-      auto nedges = rit->fanouts.size();
-      auto prev_rd = rit->relative_depth;
-      for ( ++rit; rit != fo_infos.rend(); ++rit )
-      {
-        nedges = num_splitters( nedges );
-        count += nedges + prev_rd - rit->relative_depth - 1;
-        nedges += rit->fanouts.size();
-        prev_rd = rit->relative_depth;
-      }
-      assert( nedges == 1 );
-
-      /* check if available slots in the remaining buffers are enough for POs */
-      auto slots = count * ( _ps.assume.splitter_capacity - 1 ) + 1;
-      int32_t needed_pos = _ntk.fanout_size( n ) - _external_ref_count_neg[n] - slots;
-      if ( _external_ref_count_neg[n] > 0 )
-        ++needed_pos;
-      if ( needed_pos > 0 )
+      if ( fo_infos.size() == 1u )
       {
-        count += std::ceil( float( needed_pos ) / float( _ps.assume.splitter_capacity - 1 ) );
-      }
-      if ( _external_ref_count_neg[n] > 0 )
-      {
-        count += std::max<uint32_t>( std::ceil( float( _external_ref_count_neg[n] - 1 ) / float( _ps.assume.splitter_capacity - 1 ) ), 1 );
+        assert( fo_infos.front().relative_depth == 1u );
+        return 0u;
       }
-
-      return count;
     }
 
-    /* main counting */
+    assert( fo_infos.size() > 1u );
     auto it = fo_infos.begin();
-    count = it->num_edges;
-    auto rd = it->relative_depth;
+    uint32_t count = it->num_edges - it->fanouts.size() - it->extrefs.size();
+    uint32_t rd = it->relative_depth;
     for ( ++it; it != fo_infos.end(); ++it )
     {
-      count += it->num_edges - it->fanouts.size() + it->relative_depth - rd - 1;
+      count += it->num_edges - it->fanouts.size() - it->extrefs.size() + it->relative_depth - rd - 1;
       rd = it->relative_depth;
     }
 
-    /* PO refs were added as num_edges and counted as buffers */
-    count -= _external_ref_count[n];
     return count;
   }
 
@@ -435,90 +413,37 @@ class buffer_insertion
   uint32_t num_splitter_levels( node const& n ) const
   {
     assert( n < _ntk.size() );
+    if ( _ntk.is_pi( n ) )
+    {
+      if ( _ntk.fanout_size( n ) > _ps.assume.ci_capacity )
+        return std::ceil( std::log( _ntk.fanout_size( n ) - _ps.assume.ci_capacity + 1 ) / std::log( _ps.assume.splitter_capacity ) );
+      else
+        return 0u;
+    }
     return std::ceil( std::log( _ntk.fanout_size( n ) ) / std::log( _ps.assume.splitter_capacity ) );
   }
 
-  uint32_t num_splitter_levels_po( node const& n ) const
-  {
-    assert( n < _ntk.size() );
-    if ( _ntk.fanout_size( n ) == 1 )
-      return 0;
-    if ( _external_ref_count_neg[n] == 0 || _external_ref_count_neg[n] == _external_ref_count[n] )
-      return std::ceil( std::log( _ntk.fanout_size( n ) ) / std::log( _ps.assume.splitter_capacity ) );
-    return std::max( std::ceil( std::log( _ntk.fanout_size( n ) - _external_ref_count_neg[n] ) / std::log( _ps.assume.splitter_capacity ) ),
-                     std::ceil( std::log( _external_ref_count_neg[n] ) / std::log( _ps.assume.splitter_capacity ) ) ) +
-           1; // suboptimal
-  }
-#pragma endregion
-
-private:
-#pragma region Update fanout info
-  void initialize_external_ref_counts()
-  {
-    _ntk.foreach_po( [&]( auto const& f ) {
-      if ( !_ntk.is_constant( _ntk.get_node( f ) ) )
-      {
-        _external_ref_count[f]++;
-        if ( _ntk.is_complemented( f ) )
-        {
-          _external_ref_count_neg[f]++;
-        }
-      }
-    } );
-  }
-
   /* Update fanout_information of all nodes */
   void update_fanout_info()
   {
     _fanouts.reset();
+
     _ntk.foreach_gate( [&]( auto const& n ) {
       _ntk.foreach_fanin( n, [&]( auto const& fi ) {
         auto const ni = _ntk.get_node( fi );
         if ( !_ntk.is_constant( ni ) )
-        {
           insert_fanout( ni, n );
-        }
       } );
     } );
 
-    _ntk.foreach_node( [&]( auto const& n ) {
-      if ( !_ps.assume.branch_pis && _ntk.is_pi( n ) )
-        return true;
-      if ( _external_ref_count[n] > 0u )
-        _fanouts[n].push_back( { _depth + 1 - _levels[n], {}, _external_ref_count[n] } );
-      return true;
-    } );
-
-    /* //debugging checks
-    if ( !_ps.assume.branch_pis )
-    {
-      _ntk.foreach_pi( [&]( auto const& n ) { assert( _fanouts[n].size() == 0 ); });
-      _ntk.foreach_gate( [&]( auto const& n ) {
-        if ( _ntk.fanout_size( n ) == 1 ) assert( _fanouts[n].size() == 1 );
-        else assert( _fanouts[n].front().relative_depth > 1 );
-      });
-    }
-    else
-    {
-      _ntk.foreach_node( [&]( auto const& n ) {
-        if ( _ntk.is_constant( n ) || _ntk.fanout_size( n ) == 0 ) return true;
-        if ( _ntk.fanout_size( n ) == 1 ) assert( _fanouts[n].size() == 1 );
-        else assert( _fanouts[n].front().relative_depth > 1 );
-        return true;
-      });
-    }
-    */
-
-    _ntk.foreach_gate( [&]( auto const& n ) {
-      count_edges( n );
+    _ntk.foreach_po( [&]( auto const& f, auto i ){
+      insert_extref( _ntk.get_node( f ), i );
     } );
 
-    if ( _ps.assume.branch_pis )
-    {
-      _ntk.foreach_pi( [&]( auto const& n ) {
+    _ntk.foreach_node( [&]( auto const& n ) {
+      if ( !_ntk.is_constant( n ) )
         count_edges( n );
-      } );
-    }
+    } );
 
     _outdated = false;
   }
@@ -528,6 +453,7 @@ class buffer_insertion
   bool update_fanout_info( node const& n )
   {
     std::vector<node> fos;
+    std::vector<uint32_t> extrefs;
     for ( auto it = _fanouts[n].begin(); it != _fanouts[n].end(); ++it )
     {
       if ( it->fanouts.size() )
@@ -535,24 +461,26 @@ class buffer_insertion
         for ( auto it2 = it->fanouts.begin(); it2 != it->fanouts.end(); ++it2 )
           fos.push_back( *it2 );
       }
+      if ( it->extrefs.size() )
+      {
+        for ( auto it2 = it->extrefs.begin(); it2 != it->extrefs.end(); ++it2 )
+          extrefs.push_back( *it2 );
+      }
     }
 
     _fanouts[n].clear();
     for ( auto& fo : fos )
       insert_fanout( n, fo );
-
-    if ( _external_ref_count[n] > 0u )
-      _fanouts[n].push_back( { _depth + 1 - _levels[n], {}, _external_ref_count[n] } );
+    for ( auto& po : extrefs )
+      insert_extref( n, po );
 
     return count_edges<verify>( n );
   }
 
   void insert_fanout( node const& n, node const& fanout )
   {
-    if ( !_ps.assume.branch_pis && _ntk.is_pi( n ) )
-      return;
+    assert( _levels[fanout] > _levels[n] );
     auto const rd = _levels[fanout] - _levels[n];
-    assert( rd > 0 );
     auto& fo_infos = _fanouts[n];
     for ( auto it = fo_infos.begin(); it != fo_infos.end(); ++it )
     {
@@ -564,11 +492,33 @@ class buffer_insertion
       }
       else if ( it->relative_depth > rd )
       {
-        fo_infos.insert( it, { rd, { fanout }, 1u } );
+        fo_infos.insert( it, { rd, { fanout }, {}, 1u } );
+        return;
+      }
+    }
+    fo_infos.push_back( { rd, { fanout }, {}, 1u } );
+  }
+
+  void insert_extref( node const& n, uint32_t idx )
+  {
+    assert( _po_levels[idx] > _levels[n] );
+    auto const rd = _po_levels[idx] - _levels[n];
+    auto& fo_infos = _fanouts[n];
+    for ( auto it = fo_infos.begin(); it != fo_infos.end(); ++it )
+    {
+      if ( it->relative_depth == rd )
+      {
+        it->extrefs.push_back( idx );
+        ++it->num_edges;
+        return;
+      }
+      else if ( it->relative_depth > rd )
+      {
+        fo_infos.insert( it, { rd, {}, {idx}, 1u } );
         return;
       }
     }
-    fo_infos.push_back( { rd, { fanout }, 1u } );
+    fo_infos.push_back( { rd, {}, {idx}, 1u } );
   }
 
   template<bool verify = false>
@@ -580,41 +530,44 @@ class buffer_insertion
     {
       return true;
     }
-    assert( fo_infos.front().relative_depth > 1u );
-    fo_infos.push_front( { 1u, {}, 0u } );
+
+    if ( _ntk.is_pi( n ) && _ps.assume.ci_capacity > 1 )
+    {
+      if ( fo_infos.front().relative_depth > 1u )
+        fo_infos.push_front( { 1u, {}, {}, 0u } );
+    }
+    else
+    {
+      assert( fo_infos.front().relative_depth > 1u );
+      fo_infos.push_front( { 1u, {}, {}, 0u } );
+    }
 
     auto it = fo_infos.end();
     --it;
     uint32_t splitters;
-    if ( _external_ref_count_neg[n] > 0 )
-      splitters = num_splitters_po( _external_ref_count[n] - _external_ref_count_neg[n], _external_ref_count_neg[n] );
-    else
-      splitters = num_splitters( it->num_edges );
-
     while ( it != fo_infos.begin() )
     {
+      splitters = num_splitters( it->num_edges );
       auto rd = it->relative_depth;
       --it;
       if ( it->relative_depth < rd - 1 && splitters > 1 )
       {
-        ++it;
-        it = fo_infos.insert( it, { rd - 1, {}, splitters } );
+        it = fo_infos.insert( ++it, { rd - 1, {}, {}, splitters } );
       }
       else
       {
         it->num_edges += splitters;
       }
-      splitters = num_splitters( it->num_edges );
     }
 
     assert( fo_infos.front().relative_depth == 1u );
     if constexpr ( verify )
     {
-      return fo_infos.front().num_edges == 1u;
+      return _ntk.is_pi( n ) ? fo_infos.front().num_edges <= _ps.assume.ci_capacity : fo_infos.front().num_edges == 1u;
     }
     else
     {
-      assert( fo_infos.front().num_edges == 1u );
+      assert( _ntk.is_pi( n ) ? fo_infos.front().num_edges <= _ps.assume.ci_capacity : fo_infos.front().num_edges == 1u );
       return true;
     }
   }
@@ -624,34 +577,51 @@ class buffer_insertion
   {
     return std::ceil( float( num_fanouts ) / float( _ps.assume.splitter_capacity ) );
   }
-
-  uint32_t num_splitters_po( uint32_t num_positive, uint32_t num_negative ) const
-  {
-    return std::ceil( float( num_positive ) / float( _ps.assume.splitter_capacity ) ) + std::ceil( float( num_negative ) / float( _ps.assume.splitter_capacity ) );
-  }
 #pragma endregion
 
-#pragma region Level assignment
+#pragma region Initial level assignment
 public:
   /*! \brief Obtain the initial level assignment using the specified scheduling policy */
   void schedule()
   {
     if ( _ps.scheduling == buffer_insertion_params::provided )
     {
-      _ntk.foreach_po( [&]( auto const& f ) {
-        _depth = std::max( _depth, _levels[f] + num_splitter_levels( _ntk.get_node( f ) ) );
+      _ntk.foreach_po( [&]( auto const& f, auto i ) {
+        assert( _po_levels[i] > _levels[f] );
+        _depth = std::max( _depth, _po_levels[i] - 1 );
       } );
+      assert( _depth % _ps.assume.num_phases == 0 );
+      return;
     }
-    else if ( _ps.scheduling == buffer_insertion_params::better_depth || _ps.scheduling == buffer_insertion_params::ASAP_depth || _ps.scheduling == buffer_insertion_params::ALAP_depth )
+
+    if ( _ps.scheduling == buffer_insertion_params::better_depth || _ps.scheduling == buffer_insertion_params::ASAP_depth || _ps.scheduling == buffer_insertion_params::ALAP_depth )
     {
       fanout_view<Ntk> f_ntk{ _ntk };
-      depth_optimal_schedule( f_ntk );
-    }
-    else
-    {
-      ASAP();
+      /* Optimum-depth ALAP scheduling */
+      ALAP_depth( f_ntk );
+      count_buffers();
+      auto const num_buf_ALAP_depth = num_buffers();
+
+      if ( _ps.scheduling == buffer_insertion_params::ALAP_depth )
+        return;
+
+      /* Optimum-depth ASAP scheduling: no balanced trees */
+      ASAP_depth( f_ntk, false );
+      count_buffers();
+      auto const num_buf_ASAP_depth = num_buffers();
+
+      if ( _ps.scheduling == buffer_insertion_params::ASAP_depth )
+        return;
+
+      /* Revert to optimum-depth ALAP scheduling if better */
+      if ( num_buf_ALAP_depth < num_buf_ASAP_depth )
+      {
+        ALAP_depth( f_ntk );
+      }
+      return;
     }
 
+    ASAP();
     if ( _ps.scheduling == buffer_insertion_params::ALAP )
     {
       ALAP();
@@ -676,10 +646,29 @@ class buffer_insertion
     _levels.reset( 0 );
     _ntk.incr_trav_id();
 
-    _ntk.foreach_po( [&]( auto const& f ) {
+    _ntk.foreach_po( [&]( auto const& f, auto i ) {
       auto const no = _ntk.get_node( f );
-      auto clevel = compute_levels_ASAP( no ) + num_splitter_levels_po( no );
-      _depth = std::max( _depth, clevel );
+      _po_levels[i] = compute_levels_ASAP( no ) + num_splitter_levels( no ) + 1;
+      if ( ( _po_levels[i] - 1 ) % _ps.assume.num_phases != 0 ) // phase alignment
+      {
+        _po_levels[i] += _ps.assume.num_phases - ( ( _po_levels[i] - 1 ) % _ps.assume.num_phases );
+      }
+      _depth = std::max( _depth, _po_levels[i] - 1 );
+    } );
+    assert( _depth % _ps.assume.num_phases == 0 );
+
+    if ( _ps.assume.balance_cios )
+    {
+      _ntk.foreach_po( [&]( auto const& f, auto i ) {
+        (void)f;
+        _po_levels[i] = _depth + 1;
+      } );
+    }
+
+    /* dangling PIs */
+    _ntk.foreach_pi( [&]( auto const& n ){
+      if ( _ntk.visited( n ) != _ntk.trav_id() )
+        _levels[n] = _ps.assume.ci_phases[0];
     } );
 
     _outdated = true;
@@ -694,22 +683,45 @@ class buffer_insertion
    */
   void ASAP_depth( fanout_view<Ntk> const& f_ntk, bool try_regular )
   {
-    node_map<uint32_t, Ntk> mobility( _ntk, UINT32_MAX );
+    node_map<uint32_t, Ntk> mobility( _ntk, std::numeric_limits<uint32_t>::max() );
 
-    _ntk.foreach_node( [&]( auto const& n ) {
-      if ( _ntk.is_constant( n ) || _ntk.is_pi( n ) )
-      {
-        mobility[n] = _levels[n];
-      }
+    if ( !_ps.assume.balance_cios )
+    {
+      _ntk.foreach_po( [&]( auto const& f, auto i ) {
+        (void)f;
+        _po_levels[i] = 0;
+      } );
+    }
 
+    _ntk.foreach_pi( [&]( auto const& n ) {
       if ( !_ntk.is_constant( n ) )
       {
+        mobility[n] = _levels[n] - _ps.assume.ci_phases[0];
         compute_mobility_ASAP( f_ntk, n, mobility, try_regular );
       }
+    } );
 
-      _min_level[n] = _levels[n];
+    _ntk.foreach_gate( [&]( auto const& n ) {
+      compute_mobility_ASAP( f_ntk, n, mobility, try_regular );
     } );
 
+    if ( !_ps.assume.balance_cios )
+    {
+      _ntk.foreach_po( [&]( auto const& f, auto i ) {
+        if ( _po_levels[i] == 0 )
+        {
+          assert( _ntk.is_constant( _ntk.get_node( f ) ) );
+          _po_levels[i] = 1;
+        }
+        else if ( ( _po_levels[i] - 1 ) % _ps.assume.num_phases != 0 ) // phase alignment
+        {
+          _po_levels[i] += _ps.assume.num_phases - ( ( _po_levels[i] - 1 ) % _ps.assume.num_phases );
+        }
+        _depth = std::max( _depth, _po_levels[i] - 1 );
+      } );
+      assert( _depth % _ps.assume.num_phases == 0 );
+    }
+
     _outdated = true;
     _is_scheduled_ASAP = true;
   }
@@ -720,18 +732,27 @@ class buffer_insertion
    */
   void ALAP()
   {
+    assert( _depth % _ps.assume.num_phases == 0 );
     _levels.reset( 0 );
     _ntk.incr_trav_id();
 
-    _ntk.foreach_po( [&]( auto const& f ) {
+    _ntk.foreach_po( [&]( auto const& f, auto i ) {
+      _po_levels[i] = _depth + 1;
       const auto n = _ntk.get_node( f );
-      if ( !_ntk.is_constant( n ) && _ntk.visited( n ) != _ntk.trav_id() && ( !_ps.assume.balance_pis || !_ntk.is_pi( n ) ) )
+
+      if ( !_ntk.is_constant( n ) && _ntk.visited( n ) != _ntk.trav_id() )
       {
-        _levels[n] = _depth - num_splitter_levels_po( n );
+        _levels[n] = _depth - num_splitter_levels( n );
         compute_levels_ALAP( n );
       }
     } );
 
+    /* dangling PIs */
+    _ntk.foreach_pi( [&]( auto const& n ){
+      if ( _ntk.visited( n ) != _ntk.trav_id() )
+        _levels[n] = _ps.assume.ci_phases[0];
+    } );
+
     _outdated = true;
     _is_scheduled_ASAP = false;
   }
@@ -743,65 +764,102 @@ class buffer_insertion
     topo_view<Ntk> topo_ntk{ _ntk };
 
     /* compute ALAP */
-    _depth = UINT32_MAX - 1;
-    uint32_t min_level = UINT32_MAX - 1;
+    _depth = std::numeric_limits<uint32_t>::max() - 1;
+    uint32_t min_level = std::numeric_limits<uint32_t>::max() - 1;
     topo_ntk.foreach_node_reverse( [&]( auto const& n ) {
-      if ( !_ntk.is_constant( n ) && ( _ps.assume.branch_pis || !_ntk.is_pi( n ) ) )
+      if ( !_ntk.is_constant( n ) && _ntk.fanout_size( n ) > 0 )
       {
         compute_levels_ALAP_depth( f_ntk, n );
         min_level = std::min( min_level, _levels[n] );
       }
     } );
 
-    if ( !_ps.assume.branch_pis && min_level != 0 )
-      --min_level;
+    /* move everything down by `delta` */
+    uint32_t delta = min_level;
+    /* phase alignment for PO: depth % num_phases = 0 */
+    if ( ( _depth - delta ) % _ps.assume.num_phases != 0 )
+    {
+      delta -= _ps.assume.num_phases - ( ( _depth - delta ) % _ps.assume.num_phases );
+    }
 
-    /* normalize level */
-    _ntk.foreach_node( [&]( auto const& n ) {
-      if ( !_ntk.is_constant( n ) )
-      {
-        if ( _ps.assume.balance_pis && _ntk.is_pi( n ) )
+    /* level of the lowest PI >= ci_phases[0] */
+    while ( min_level - delta < _ps.assume.ci_phases[0] )
+    {
+      delta -= _ps.assume.num_phases;
+    }
+    /* move PIs down to an acceptable level */
+    if ( _ps.assume.balance_cios )
+    {
+      _ntk.foreach_pi( [&]( auto const& n ) {
+        if ( _ntk.fanout_size( n ) == 0 )
         {
-          _levels[n] = 0;
+          _levels[n] = _ps.assume.ci_phases[0];
         }
-        else if ( !_ps.assume.balance_pis || !_ntk.is_pi( n ) )
+        else if ( !_ntk.is_constant( n ) )
         {
-          _levels[n] = _levels[n] - min_level;
+          _levels[n] = _levels[n] - delta;
+          for ( auto rit = _ps.assume.ci_phases.rbegin(); rit != _ps.assume.ci_phases.rend(); ++rit )
+          {
+            if ( *rit <= _levels[n] )
+            {
+              _levels[n] = *rit;
+              return;
+            }
+          }
+          assert( false );
         }
-        _max_level[n] = _levels[n];
-      }
+      } );
+    }
+    else
+    {
+      _ntk.foreach_pi( [&]( auto const& n ) {
+        if ( _ntk.fanout_size( n ) == 0 )
+        {
+          _levels[n] = _ps.assume.ci_phases[0];
+        }
+        else if ( !_ntk.is_constant( n ) )
+        {
+          _levels[n] = _levels[n] - delta;
+          while ( !is_acceptable_ci_lvl( _levels[n] ) )
+          {
+            assert( _levels[n] > 0 );
+            --_levels[n];
+          }
+        }
+      } );
+    }
+
+    _ntk.foreach_gate( [&]( auto const& n ) {
+      _levels[n] = _levels[n] - delta;
     } );
+    _depth -= delta;
+    assert( _depth % _ps.assume.num_phases == 0 );
+    if ( _ps.assume.balance_cios )
+    {
+      _ntk.foreach_po( [&]( auto const& f, auto i ) {
+        (void)f;
+        _po_levels[i] = _depth + 1;
+      } );
+    }
+    else
+    {
+      _ntk.foreach_po( [&]( auto const& f, auto i ) {
+        if ( _ntk.is_constant( _ntk.get_node( f ) ) )
+          _po_levels[i] = 1;
+        else
+        {
+          _po_levels[i] = _levels[f] + num_splitter_levels( _ntk.get_node( f ) );
+          if ( _po_levels[i] % _ps.assume.num_phases > 0 )
+            _po_levels[i] += _ps.assume.num_phases - ( _po_levels[i] % _ps.assume.num_phases );
+          ++_po_levels[i];
+        }
+      } );
+    }
 
-    _depth -= min_level;
     _outdated = true;
     _is_scheduled_ASAP = false;
   }
 
-  void depth_optimal_schedule( fanout_view<Ntk> const& f_ntk )
-  {
-    /* Optimum-depth ALAP scheduling */
-    ALAP_depth( f_ntk );
-    count_buffers();
-    auto const num_buf_ALAP_depth = num_buffers();
-
-    if ( _ps.scheduling == buffer_insertion_params::ALAP_depth )
-      return;
-
-    /* Optimum-depth ALAP scheduling: no balanced trees */
-    ASAP_depth( f_ntk, false );
-    count_buffers();
-    auto const num_buf_ASAP_depth = num_buffers();
-
-    if ( _ps.scheduling == buffer_insertion_params::ASAP_depth )
-      return;
-
-    /* Revert to optimum-depth ALAP scheduling if better */
-    if ( num_buf_ALAP_depth < num_buf_ASAP_depth )
-    {
-      ALAP_depth( f_ntk );
-    }
-  }
-
 private:
   uint32_t compute_levels_ASAP( node const& n )
   {
@@ -811,21 +869,21 @@ class buffer_insertion
     }
     _ntk.set_visited( n, _ntk.trav_id() );
 
-    if ( _ntk.is_constant( n ) || _ntk.is_pi( n ) )
+    if ( _ntk.is_constant( n ) )
     {
       return _levels[n] = 0;
     }
+    else if ( _ntk.is_pi( n ) )
+    {
+      return _levels[n] = _ps.assume.ci_phases[0];
+    }
 
     uint32_t level{ 0 };
     _ntk.foreach_fanin( n, [&]( auto const& fi ) {
       auto const ni = _ntk.get_node( fi );
       if ( !_ntk.is_constant( ni ) )
       {
-        auto fi_level = compute_levels_ASAP( ni );
-        if ( _ps.assume.branch_pis || !_ntk.is_pi( ni ) )
-        {
-          fi_level += num_splitter_levels( ni );
-        }
+        auto fi_level = compute_levels_ASAP( ni ) + num_splitter_levels( ni );
         level = std::max( level, fi_level );
       }
     } );
@@ -833,29 +891,70 @@ class buffer_insertion
     return _levels[n] = level + 1;
   }
 
-  void compute_levels_ALAP( node const& n )
+  bool is_acceptable_ci_lvl( uint32_t lvl ) const
   {
-    _ntk.set_visited( n, _ntk.trav_id() );
-
-    _ntk.foreach_fanin( n, [&]( auto const& fi ) {
-      auto const ni = _ntk.get_node( fi );
-      if ( !_ntk.is_constant( ni ) )
+    if ( _ps.assume.balance_cios )
+    {
+      for ( auto const& p : _ps.assume.ci_phases )
       {
-        if ( _ps.assume.balance_pis && _ntk.is_pi( ni ) )
-        {
-          assert( _levels[n] > 0 );
-          _levels[ni] = 0;
-        }
-        else if ( _ps.assume.branch_pis || !_ntk.is_pi( ni ) )
-        {
-          assert( _levels[n] > num_splitter_levels( ni ) );
-          auto fi_level = _levels[n] - num_splitter_levels( ni ) - 1;
-          if ( _ntk.visited( ni ) != _ntk.trav_id() || _levels[ni] > fi_level )
-          {
-            _levels[ni] = fi_level;
-            compute_levels_ALAP( ni );
-          }
-        }
+        if ( lvl == p )
+          return true;
+      }
+      return false;
+    }
+    else
+    {
+      for ( auto const& p : _ps.assume.ci_phases )
+      {
+        // for example, if num_phases = 4, ci_phases = {5},
+        // then lvl = 1 will not be acceptable, but lvl = 5 or lvl = 9 will
+        if ( lvl % _ps.assume.num_phases == p % _ps.assume.num_phases && lvl >= p )
+          return true;
+      }
+      return false;
+    }
+  }
+
+  void compute_levels_ALAP( node const& n )
+  {
+    _ntk.set_visited( n, _ntk.trav_id() );
+
+    if ( _ntk.is_pi( n ) )
+    {
+      if ( _ps.assume.balance_cios )
+      {
+        for ( auto rit = _ps.assume.ci_phases.rbegin(); rit != _ps.assume.ci_phases.rend(); ++rit )
+        {
+          if ( *rit <= _levels[n] )
+          {
+            _levels[n] = *rit;
+            return;
+          }
+        }
+        assert( false );
+      }
+      else
+      {
+        while ( !is_acceptable_ci_lvl( _levels[n] ) )
+        {
+          assert( _levels[n] > 0 );
+          --_levels[n];
+        }
+      }
+      return;
+    }
+
+    _ntk.foreach_fanin( n, [&]( auto const& fi ) {
+      auto const ni = _ntk.get_node( fi );
+      if ( !_ntk.is_constant( ni ) )
+      {
+        assert( _levels[n] > num_splitter_levels( ni ) );
+        auto fi_level = _levels[n] - num_splitter_levels( ni ) - 1;
+        if ( _ntk.visited( ni ) != _ntk.trav_id() || _levels[ni] > fi_level )
+        {
+          _levels[ni] = fi_level;
+          compute_levels_ALAP( ni );
+        }
       }
     } );
   }
@@ -874,13 +973,6 @@ class buffer_insertion
       level_assignment.push_back( _levels[f] );
     } );
 
-    /* dangling PI */
-    if ( level_assignment.empty() )
-    {
-      _levels[n] = _depth;
-      return;
-    }
-
     /* sort by descending order of levels */
     std::sort( level_assignment.begin(), level_assignment.end(), std::greater<uint32_t>() );
 
@@ -906,10 +998,21 @@ class buffer_insertion
 
     /* search for a feasible level for node n */
     --last_level;
-    while ( nodes_in_level > 1 )
+    if ( _ntk.is_pi( n ) )
     {
-      nodes_in_level = std::ceil( float( nodes_in_level ) / float( _ps.assume.splitter_capacity ) );
-      --last_level;
+      while ( nodes_in_level > _ps.assume.ci_capacity )
+      {
+        nodes_in_level = std::ceil( float( nodes_in_level ) / float( _ps.assume.splitter_capacity ) );
+        --last_level;
+      }
+    }
+    else
+    {
+      while ( nodes_in_level > 1 )
+      {
+        nodes_in_level = std::ceil( float( nodes_in_level ) / float( _ps.assume.splitter_capacity ) );
+        --last_level;
+      }
     }
 
     _levels[n] = last_level;
@@ -917,18 +1020,11 @@ class buffer_insertion
 
   void compute_mobility_ASAP( fanout_view<Ntk> const& ntk, node const& n, node_map<uint32_t, Ntk>& mobility, bool try_regular )
   {
+    assert( mobility[n] <= _levels[n] );
     /* commit ASAP scheduling */
     uint32_t level_n = _levels[n] - mobility[n];
     _levels[n] = level_n;
 
-    if ( !_ps.assume.branch_pis && _ntk.is_pi( n ) )
-    {
-      ntk.foreach_fanout( n, [&]( auto const& f ) {
-        mobility[f] = std::min( mobility[f], _levels[f] - level_n - 1 );
-      } );
-      return;
-    }
-
     /* try to fit a balanced tree */
     if ( try_regular )
     {
@@ -954,8 +1050,14 @@ class buffer_insertion
     level_assignment.reserve( _ntk.fanout_size( n ) );
 
     /* if node is a PO, add levels */
-    for ( auto i = ntk.fanout( n ).size(); i < ntk.fanout_size( n ); ++i )
-      level_assignment.push_back( { 0, _depth + 1, 0 } );
+    if ( ntk.fanout( n ).size() < ntk.fanout_size( n ) )
+    {
+      ntk.foreach_po( [&]( auto const& f, auto i ){
+        if ( ntk.get_node( f ) == n )
+          level_assignment.push_back( { i, _depth + 1, 0 } );
+      } );
+      assert( level_assignment.size() == ntk.fanout_size( n ) - ntk.fanout( n ).size() );
+    }
 
     /* get fanout levels */
     ntk.foreach_fanout( n, [&]( auto const& f ) {
@@ -1012,7 +1114,7 @@ class buffer_insertion
     uint32_t mobility_update = 0;
     for ( auto i = level_n + 1; i < last_level; ++i )
     {
-      if ( nodes_in_level == 1 )
+      if ( nodes_in_level == 1 || ( _ntk.is_pi( n ) && nodes_in_level <= _ps.assume.ci_capacity ) )
         ++mobility_update;
       nodes_in_level = std::ceil( float( nodes_in_level ) / float( _ps.assume.splitter_capacity ) );
     }
@@ -1020,98 +1122,29 @@ class buffer_insertion
     /* update mobilities */
     for ( auto const& v : level_assignment )
     {
-      if ( v[0] != 0 )
+      if ( v[1] != _depth + 1 )
       {
         mobility[v[0]] = std::min( mobility[v[0]], v[2] + mobility_update );
       }
     }
-  }
-#pragma endregion
 
-#pragma region Compute timeframe
-  /*! \brief Compute the earliest and latest possible timeframe by eager ASAP and ALAP */
-  uint32_t compute_timeframe( uint32_t max_depth )
-  {
-    _timeframes.reset( std::make_pair( 0, 0 ) );
-    uint32_t min_depth{ 0 };
-
-    _ntk.incr_trav_id();
-    _ntk.foreach_po( [&]( auto const& f ) {
-      auto const no = _ntk.get_node( f );
-      auto clevel = compute_levels_ASAP_eager( no ) + ( _ntk.fanout_size( no ) > 1 ? 1 : 0 );
-      min_depth = std::max( min_depth, clevel );
-    } );
-
-    _ntk.incr_trav_id();
-    _ntk.foreach_po( [&]( auto const& f ) {
-      const auto n = _ntk.get_node( f );
-      if ( !_ntk.is_constant( n ) && _ntk.visited( n ) != _ntk.trav_id() && ( !_ps.assume.balance_pis || !_ntk.is_pi( n ) ) )
-      {
-        _timeframes[n].second = max_depth - ( _ntk.fanout_size( n ) > 1 ? 1 : 0 );
-        compute_levels_ALAP_eager( n );
-      }
-    } );
-
-    return min_depth;
-  }
-
-  uint32_t compute_levels_ASAP_eager( node const& n )
-  {
-    if ( _ntk.visited( n ) == _ntk.trav_id() )
-    {
-      return _timeframes[n].first;
-    }
-    _ntk.set_visited( n, _ntk.trav_id() );
-
-    if ( _ntk.is_constant( n ) || _ntk.is_pi( n ) )
+    /* update po_level, if possible */
+    if ( !_ps.assume.balance_cios )
     {
-      return _timeframes[n].first = 0;
-    }
-
-    uint32_t level{ 0 };
-    _ntk.foreach_fanin( n, [&]( auto const& fi ) {
-      auto const ni = _ntk.get_node( fi );
-      if ( !_ntk.is_constant( ni ) )
+      for ( auto const& v : level_assignment )
       {
-        auto fi_level = compute_levels_ASAP_eager( ni );
-        if ( _ps.assume.branch_pis || !_ntk.is_pi( ni ) )
+        if ( v[1] == _depth + 1 )
         {
-          fi_level += _ntk.fanout_size( ni ) > 1 ? 1 : 0;
+          _po_levels[v[0]] = std::max( _po_levels[v[0]], _depth + 1 - v[2] - mobility_update );
         }
-        level = std::max( level, fi_level );
-      }
-    } );
-
-    return _timeframes[n].first = level + 1;
-  }
-
-  void compute_levels_ALAP_eager( node const& n )
-  {
-    _ntk.set_visited( n, _ntk.trav_id() );
-
-    _ntk.foreach_fanin( n, [&]( auto const& fi ) {
-      auto const ni = _ntk.get_node( fi );
-      if ( !_ntk.is_constant( ni ) )
-      {
-        if ( _ps.assume.balance_pis && _ntk.is_pi( ni ) )
-        {
-          assert( _timeframes[n].second > 0 );
-          _timeframes[ni].second = 0;
-        }
-        else if ( _ps.assume.branch_pis || !_ntk.is_pi( ni ) )
+        else
         {
-          assert( _timeframes[n].second > num_splitter_levels( ni ) );
-          auto fi_level = _timeframes[n].second - ( _ntk.fanout_size( ni ) > 1 ? 2 : 1 );
-          if ( _ntk.visited( ni ) != _ntk.trav_id() || _timeframes[ni].second > fi_level )
-          {
-            _timeframes[ni].second = fi_level;
-            compute_levels_ALAP_eager( ni );
-          }
+          break;
         }
       }
-    } );
+    }
   }
-#pragma
+#pragma endregion
 
 #pragma region Dump buffered network
 public:
@@ -1139,7 +1172,6 @@ class buffer_insertion
     buffers[_ntk.get_constant( false )].emplace_back( 1, bufntk.get_constant( false ) );
     if ( _ntk.get_node( _ntk.get_constant( false ) ) != _ntk.get_node( _ntk.get_constant( true ) ) )
     {
-      std::cerr << "[w] ntk has different nodes for const0 and const1 -- poorly tested case, might be buggy.\n";
       node_to_signal[_ntk.get_constant( true )] = bufntk.get_constant( true );
       buffers[_ntk.get_constant( true )].emplace_back( 1, bufntk.get_constant( true ) );
     }
@@ -1147,28 +1179,16 @@ class buffer_insertion
     /* PIs */
     _ntk.foreach_pi( [&]( auto const& n ) {
       node_to_signal[n] = bufntk.create_pi();
+      create_buffer_chain( bufntk, buffers, n, node_to_signal[n] );
     } );
-    if ( _ps.assume.branch_pis )
-    {
-      _ntk.foreach_pi( [&]( auto const& n ) {
-        create_buffer_chain( bufntk, buffers, n, node_to_signal[n] );
-      } );
-    }
-    else
-    {
-      _ntk.foreach_pi( [&]( auto const& n ) {
-        buffers[n].emplace_back( 1, node_to_signal[n] );
-      } );
-    }
 
     /* gates: assume topological order */
     _ntk.foreach_gate( [&]( auto const& n ) {
       std::vector<buf_signal> children;
       _ntk.foreach_fanin( n, [&]( auto const& fi ) {
-        auto ni = _ntk.get_node( fi );
         buf_signal s;
-        if ( _ntk.is_constant( ni ) || ( !_ps.assume.branch_pis && _ntk.is_pi( ni ) ) )
-          s = node_to_signal[ni];
+        if ( _ntk.is_constant( _ntk.get_node( fi ) ) )
+          s = node_to_signal[fi];
         else
           s = get_buffer_at_relative_depth( bufntk, buffers[fi], _levels[n] - _levels[fi] - 1 );
         children.push_back( _ntk.is_complemented( fi ) ? !s : s );
@@ -1178,129 +1198,17 @@ class buffer_insertion
     } );
 
     /* POs */
-    if ( _ps.assume.balance_pos )
-    {
-      _ntk.foreach_po( [&]( auto const& f, uint32_t i ) {
-        auto n = _ntk.get_node( f );
-        if ( _ntk.is_constant( n ) || ( !_ps.assume.branch_pis && _ntk.is_pi( n ) ) ) // not branch => not balance
-        {
-          if ( _ntk.is_pi( n ) && _ntk.is_complemented( f ) )
-            std::cerr << "[w] an explicit inverter between non-branched PI " << n << " and PO " << i << " is neglected.\n";
-          bufntk.create_po( _ntk.is_complemented( f ) ? !node_to_signal[f] : node_to_signal[f] );
-        }
-        else if ( _ntk.fanout_size( n ) == 1 && _depth == _levels[f] )
-        {
-          if ( _ntk.is_complemented( f ) )
-            bufntk.invert( bufntk.get_node( node_to_signal[f] ) );
-          bufntk.create_po( node_to_signal[f] );
-        }
-        else
-        {
-          buf_signal s = get_buffer_or_inverter( bufntk, buffers[f], _depth - _levels[f], _ntk.is_complemented( f ) );
-          bufntk.create_po( s );
-        }
-      } );
-    }
-    else // !_ps.assume.balance_pos
-    {
-      std::set<node> checked;
-      unordered_node_map<std::list<buf_signal>, Ntk> inverted_buffers( _ntk );
-      _ntk.foreach_po( [&]( auto const& f ) {
-        auto n = _ntk.get_node( f );
-        if ( !_ntk.is_constant( n ) && !( _ntk.is_pi( n ) && !_ps.assume.branch_pis ) && _ntk.fanout_size( n ) > 1 )
-        {
-          if ( checked.find( n ) == checked.end() )
-          {
-            checked.insert( n );
-            /* count available slots in buffers[n] */
-            uint32_t slots{ 0u };
-            for ( auto const& bufs : buffers[n] )
-            {
-              slots += _ps.assume.splitter_capacity - bufntk.fanout_size( bufntk.get_node( bufs.back() ) );
-            }
-            slots -= _ps.assume.splitter_capacity - 1; // buffers[n][0] is n itself
-
-            /* add enough buffers */
-            if ( _external_ref_count[n] > _external_ref_count_neg[n] ) /* there are pos POs */
-            {
-              if ( buffers[n].size() == 1 )
-              {
-                buffers[n].emplace_back( 1, bufntk.create_buf( buffers[n][0].back() ) );
-                slots += _ps.assume.splitter_capacity - 1;
-              }
-              uint32_t needed_slots = _external_ref_count[n] - _external_ref_count_neg[n];
-              if ( _external_ref_count_neg[n] > 0 )
-                ++needed_slots;
-              while ( slots < needed_slots )
-              {
-                auto p = get_lowest_spot( bufntk, buffers[n] );
-                add_splitter( bufntk, buffers[n], p.first, p.second );
-                slots += _ps.assume.splitter_capacity - 1;
-              }
-            }
-
-            /* add inverted buffer tree */
-            if ( _external_ref_count_neg[n] > 0 )
-            {
-              auto p = get_lowest_spot( bufntk, buffers[n] );
-              buf_signal const& s = p.first;
-              uint32_t const& rd = p.second;
-              if ( _external_ref_count_neg[n] == _ntk.fanout_size( n ) )
-              {
-                bufntk.invert( bufntk.get_node( s ) );
-                buffers[n][rd].remove( s );
-                inverted_buffers[n].push_back( s );
-              }
-              else
-              {
-                inverted_buffers[n].push_back( bufntk.create_buf( !s ) );
-              }
-              uint32_t inverted_slots{ _ps.assume.splitter_capacity };
-              while ( inverted_slots < _external_ref_count_neg[n] )
-              {
-                buf_signal s = get_first_spot( bufntk, inverted_buffers[n] );
-                inverted_buffers[n].push_back( bufntk.create_buf( s ) );
-                inverted_slots += _ps.assume.splitter_capacity - 1;
-              }
-            }
-
-            /* check */
-            uint32_t nbufs = 0;
-            for ( auto l : buffers[n] )
-              nbufs += l.size();
-            assert( nbufs - 1 + inverted_buffers[n].size() == _num_buffers[n] );
-          }
-        }
-      } );
-
-      _ntk.foreach_po( [&]( auto const& f, uint32_t i ) {
-        auto n = _ntk.get_node( f );
-        if ( _ntk.is_constant( n ) || ( _ntk.is_pi( n ) && ( !_ps.assume.branch_pis || _ntk.fanout_size( n ) == 1 ) ) )
-        {
-          if ( _ntk.is_pi( n ) && _ntk.is_complemented( f ) )
-            std::cerr << "[w] an explicit inverter between non-branched PI " << n << " and PO " << i << " is neglected.\n";
-          bufntk.create_po( _ntk.is_complemented( f ) ? !node_to_signal[f] : node_to_signal[f] );
-        }
-        else if ( _ntk.fanout_size( n ) == 1 )
-        {
-          if ( _ntk.is_complemented( f ) )
-            bufntk.invert( bufntk.get_node( node_to_signal[f] ) );
-          bufntk.create_po( node_to_signal[f] );
-        }
-        else
-        {
-          buf_signal s = _ntk.is_complemented( f ) ? get_first_spot( bufntk, inverted_buffers[n] ) : get_lowest_spot( bufntk, buffers[n] ).first;
-          assert( bufntk.is_buf( bufntk.get_node( s ) ) );
-          bufntk.create_po( s );
-        }
-      } );
-    }
+    _ntk.foreach_po( [&]( auto const& f, auto i ) {
+      buf_signal s;
+      if ( _ntk.is_constant( _ntk.get_node( f ) ) )
+        s = node_to_signal[f];
+      else
+        s = get_buffer_at_relative_depth( bufntk, buffers[f], _po_levels[i] - _levels[f] - 1 );
+      assert( _ps.assume.ignore_co_negation );
+      bufntk.create_po( _ntk.is_complemented( f ) ? !s : s );
+    } );
 
-    // assert( bufntk.size() - bufntk.num_pis() - bufntk.num_gates() - 1 == num_buffers() );
-    if ( bufntk.size() - bufntk.num_pis() - bufntk.num_gates() - 1 != num_buffers() )
-    {
-      std::cerr << "[w] actual #bufs = " << ( bufntk.size() - bufntk.num_pis() - bufntk.num_gates() - 1 ) << ", counted = " << num_buffers() << "\n";
-    }
+    assert( bufntk.size() - bufntk.num_pis() - bufntk.num_gates() - 1 == num_buffers() );
   }
 
 private:
@@ -1310,70 +1218,25 @@ class buffer_insertion
     if ( _ntk.fanout_size( n ) == 0 )
       return; /* dangling */
 
-    auto const& fanout_info = _fanouts[n];
-    assert( fanout_info.size() > 0u );
-
-    if ( _external_ref_count[n] > 0u && !_ps.assume.balance_pos )
-    {
-      if ( _ntk.fanout_size( n ) == _external_ref_count[n] )
-      {
-        if ( _external_ref_count[n] > _external_ref_count_neg[n] )
-          buffers[n].resize( std::ceil( std::log( _external_ref_count[n] - _external_ref_count_neg[n] ) / std::log( _ps.assume.splitter_capacity ) ) + 1 );
-        else
-          buffers[n].resize( _external_ref_count_neg[n] > 1 ? 2 : 1 );
-      }
-      else
-      {
-        auto it = fanout_info.rbegin();
-        while ( it->fanouts.size() == 0u )
-          ++it;
-        buffers[n].resize( it->relative_depth );
-      }
-    }
-    else
-    {
-      buffers[n].resize( fanout_info.back().relative_depth );
-    }
+    assert( _fanouts[n].size() > 0u );
+    buffers[n].resize( _fanouts[n].back().relative_depth );
     auto& fot = buffers[n];
-
-    typename BufNtk::signal fi = s;
-    fot[0].push_back( fi );
+    fot[0].push_back( s );
     for ( auto i = 1u; i < fot.size(); ++i )
     {
-      fi = bufntk.create_buf( fi );
-      fot[i].push_back( fi );
+      fot[i].push_back( bufntk.create_buf( fot[i-1].back() ) );
     }
   }
 
-  template<class BufNtk, typename FOT>
-  typename BufNtk::signal get_buffer_or_inverter( BufNtk& bufntk, FOT& fot, uint32_t rd, bool inverted ) const
-  {
-    assert( rd == fot.size() - 1 ); // must be at the highest level
-    for ( auto it = fot[rd].begin(); it != fot[rd].end(); ++it )
-    {
-      auto b = bufntk.get_node( *it );
-      if ( bufntk.fanout_size( b ) < _ps.assume.splitter_capacity )
-      {
-        if ( bufntk.is_not( b ) != inverted )
-        {
-          if ( bufntk.fanout_size( b ) == 0 )
-            bufntk.invert( b );
-          else
-            continue;
-        }
-        return *it;
-      }
-    }
-    typename BufNtk::signal b_lower = get_buffer_at_relative_depth( bufntk, fot, rd - 1 );
-    typename BufNtk::signal b = bufntk.create_buf( inverted ? !b_lower : b_lower );
-    fot[rd].push_back( b );
-    return b;
-  }
-
   template<class BufNtk, typename FOT>
   typename BufNtk::signal get_buffer_at_relative_depth( BufNtk& bufntk, FOT& fot, uint32_t rd ) const
   {
     typename BufNtk::signal b = fot[rd].back();
+    if ( rd == 0 && bufntk.is_pi( bufntk.get_node( b ) ) )
+    {
+      assert( bufntk.fanout_size( bufntk.get_node( b ) ) < _ps.assume.ci_capacity );
+      return b;
+    }
     if ( bufntk.fanout_size( bufntk.get_node( b ) ) == _ps.assume.splitter_capacity )
     {
       assert( rd > 0 );
@@ -1383,68 +1246,92 @@ class buffer_insertion
     }
     return b;
   }
+#pragma endregion
 
-  template<class BufNtk, typename FOT>
-  std::pair<typename BufNtk::signal, uint32_t> get_lowest_spot( BufNtk& bufntk, FOT& fot ) const
+#pragma region Post-dump optimization
+public:
+template<class BufNtk> /* post-dump pass: starts a recursive traversal from the driver of every PO of `ntk` */
+uint32_t remove_buffer_chains( BufNtk& ntk ) const /* returns the largest chain length recorded at a removal point (0 if nothing was removed) */
+{
+  static_assert( is_buffered_network_type_v<BufNtk>, "BufNtk is not a buffered network" );
+
+  uint32_t max_chain = 0; // updated by the recursion whenever a buffer is taken out
+  ntk.incr_trav_id(); // fresh traversal ID; the recursion uses it to process each node once
+  ntk.foreach_po( [&]( auto f ){
+    remove_buffer_chains_rec( ntk, ntk.get_node( f ), 0, max_chain ); // parent == 0 marks "reached directly from a PO"
+  } );
+  return max_chain;
+}
+
+private:
+template<class BufNtk> /* recursive helper of `remove_buffer_chains` */
+std::pair<uint32_t, typename BufNtk::node> remove_buffer_chains_rec( BufNtk& ntk, typename BufNtk::node n, typename BufNtk::node parent, uint32_t& max_chain ) const /* returns (#consecutive single-fanout buffers ending at `n`, node the chain starts from) */
+{
+  if ( ntk.visited( n ) == ntk.trav_id() ) // already processed in this traversal
+    return std::make_pair( 0, n );
+  ntk.set_visited( n, ntk.trav_id() );
+  if ( ntk.is_pi( n ) ) // PIs end the recursion and start a chain of length 0
+    return std::make_pair( 0, n );
+
+  if ( ntk.is_buf( n ) )
   {
-    for ( auto rd = 1u; rd < fot.size(); ++rd )
+    // splitter: never part of a removable chain; recurse into its fanin and restart counting at `n`
+    if ( ntk.fanout_size( n ) > 1 )
     {
-      for ( auto it = fot[rd].begin(); it != fot[rd].end(); ++it )
+      ntk.foreach_fanin( n, [&]( auto f ){
+        remove_buffer_chains_rec( ntk, ntk.get_node( f ), n, max_chain );
+      } );
+      return std::make_pair( 0, n );
+    }
+
+    // single-output buffer: can be part of a chain to be removed
+    std::pair<uint32_t, typename BufNtk::node> ret; // assigned inside the fanin callback below
+    ntk.foreach_fanin( n, [&]( auto f ){
+      auto [count, origin] = remove_buffer_chains_rec( ntk, ntk.get_node( f ), n, max_chain );
+      if ( count % _ps.assume.num_phases == _ps.assume.num_phases - 1 ) // i.e. `count + 1` (chain including `n`) is a multiple of `num_phases`
       {
-        typename BufNtk::signal& b = *it;
-        if ( bufntk.fanout_size( bufntk.get_node( b ) ) < _ps.assume.splitter_capacity )
+        // TODO: take care of complementation (the replacement below uses the plain `origin` signal)
+        if ( parent != 0 ) // `n` feeds a node: rewire that node to `origin`
+        {
+          ntk.replace_in_node( parent, n, ntk.make_signal( origin ) );
+          ntk.take_out_node( n );
+        }
+        else // `n` was reached directly from a PO: rewire the outputs instead
         {
-          return { b, rd };
+          ntk.replace_in_outputs( n, ntk.make_signal( origin ) );
+          ntk.take_out_node( n );
         }
+        max_chain = std::max( count + 1, max_chain ); // remember the longest chain at which a removal happened
       }
-    }
-    assert( false );
-  }
-
-  template<class BufNtk, typename BufSig = typename BufNtk::signal>
-  BufSig get_first_spot( BufNtk const& bufntk, std::list<BufSig> const& bufs ) const
-  {
-    auto it = bufs.begin();
-    while ( it != bufs.end() )
-    {
-      if ( bufntk.fanout_size( bufntk.get_node( *it ) ) < _ps.assume.splitter_capacity )
-        return *it;
-      ++it;
-    }
-    assert( false );
+      ret = std::make_pair( count + 1, origin ); // counting continues upwards even after a removal
+    } );
+    return ret;
   }
 
-  template<class BufNtk, typename FOT>
-  void add_splitter( BufNtk& bufntk, FOT& fot, typename BufNtk::signal b, uint32_t rd ) const
-  {
-    if ( rd == fot.size() - 1 )
-      fot.emplace_back( 1, bufntk.create_buf( b ) );
-    else
-      fot[rd + 1].push_back( bufntk.create_buf( b ) );
-  }
+  // gate: recurse into every fanin; a gate starts a new chain of length 0
+  ntk.foreach_fanin( n, [&]( auto f ){
+    remove_buffer_chains_rec( ntk, ntk.get_node( f ), n, max_chain );
+  } );
+  return std::make_pair( 0, n );
+}
 #pragma endregion
 
 public:
-  /*! \brief Optimize with chunked movement using the specified optimization policy.
-   *
-   * For more information, please refer to [1].
-   *
-   * [1] Irredundant Buffer and Splitter Insertion and Scheduling-Based Optimization for AQFP Circuits.
-   * Siang-Yun Lee et. al. IWLS 2021. */
+  /*! \brief Optimize with chunked movement using the specified optimization policy. */
   void optimize()
   {
     if ( _ps.optimization_effort == buffer_insertion_params::none )
     {
       return;
     }
-    else if ( _ps.optimization_effort == buffer_insertion_params::optimal )
-    {
-      if constexpr ( has_get_network_name_v<Ntk> )
-        optimize_with_smt( _ntk.get_network_name() );
-      else
-        optimize_with_smt( "" );
-      return;
-    }
+    //else if ( _ps.optimization_effort == buffer_insertion_params::optimal )
+    //{
+    //  if constexpr ( has_get_network_name_v<Ntk> )
+    //    optimize_with_smt( _ntk.get_network_name() );
+    //  else
+    //    optimize_with_smt( "" );
+    //  return;
+    //}
 
     if ( _outdated )
     {
@@ -1456,8 +1343,7 @@ class buffer_insertion
     {
       updated = find_and_move_chunks();
     } while ( updated && _ps.optimization_effort == buffer_insertion_params::until_sat );
-
-    adjust_depth();
+    single_gate_movement();
   }
 
 #pragma region Chunked movement
@@ -1468,26 +1354,31 @@ class buffer_insertion
     node o; // outside node
   };
 
+  struct po_interface /* connection between a chunk member and a PO it drives */
+  {
+    node c; // chunk node
+    uint32_t o; // PO index (used to index `_po_levels`)
+  };
+
   struct chunk
   {
     uint32_t id;
     std::vector<node> members{};
     std::vector<io_interface> input_interfaces{};
     std::vector<io_interface> output_interfaces{};
+    std::vector<po_interface> po_interfaces{}; // (chunk node, PO index) pairs, one per PO reference of a member
     int32_t slack{ std::numeric_limits<int32_t>::max() };
     int32_t benefits{ 0 };
   };
 
   bool is_ignored( node const& n ) const
   {
-    return _ntk.is_constant( n ) || ( !_ps.assume.branch_pis && _ntk.is_pi( n ) );
+    return _ntk.is_constant( n ); // only constant nodes are ignored; PIs are no longer special-cased here
   }
 
   bool is_fixed( node const& n ) const
   {
-    if ( _ps.assume.balance_pis )
-      return _ntk.is_pi( n );
-    return false;
+    return _ps.assume.balance_cios && _ps.assume.ci_phases.size() == 1 && _ntk.is_pi( n ); // a PI is fixed only if CIOs are balanced and `ci_phases` has exactly one entry
   }
 
   bool find_and_move_chunks()
@@ -1520,16 +1411,42 @@ class buffer_insertion
     } );
 
     count_buffers();
-    // assert( num_buffers() <= num_buffers_before );
+    assert( num_buffers() <= num_buffers_before );
     return updated && num_buffers() < num_buffers_before;
   }
 
+  void single_gate_movement() /* tries to move every non-ignored, non-fixed node on its own, as a one-member chunk */
+  {
+    _ntk.foreach_node( [&]( auto const& n ) {
+      if ( is_ignored( n ) || is_fixed( n ) )
+        return;
+
+      _ntk.incr_trav_id(); // a fresh traversal ID serves as the chunk ID
+      chunk c{ _ntk.trav_id() };
+      c.members.emplace_back( n );
+      _ntk.foreach_fanin( n, [&]( auto const& fi ) {
+        auto const ni = _ntk.get_node( fi );
+        if ( !is_ignored( ni )  )
+          c.input_interfaces.push_back( { n, ni } ); // every non-ignored fanin is an input interface
+      } );
+      auto const& fanout_info = _fanouts[n];
+      for ( auto it = fanout_info.begin(); it != fanout_info.end(); ++it )
+      {
+        for ( auto it2 = it->fanouts.begin(); it2 != it->fanouts.end(); ++it2 )
+          c.output_interfaces.push_back( { n, *it2 } ); // every gate fanout is an output interface
+        for ( auto it2 = it->extrefs.begin(); it2 != it->extrefs.end(); ++it2 )
+          c.po_interfaces.push_back( { n, *it2 } ); // every PO reference is a PO interface
+      }
+
+      if ( !analyze_chunk_down( c ) ) // try moving down first; only if that fails, try moving up
+        analyze_chunk_up( c );
+    } );
+  }
+
   void recruit( node const& n, chunk& c )
   {
     if ( _ntk.visited( n ) == c.id )
       return;
-    // if ( c.members.size() > _ps.max_chunk_size ) // TODO: Directly returning might be problematic
-    //   return;
 
     assert( _ntk.visited( n ) <= _start_id );
     assert( !is_fixed( n ) );
@@ -1560,43 +1477,92 @@ class buffer_insertion
   void recruit_fanouts( node const& n, chunk& c )
   {
     auto const& fanout_info = _fanouts[n];
-    if ( fanout_info.size() == 0 )
+    if ( fanout_info.size() == 0 ) /* dangling */
       return;
 
-    if ( _ntk.fanout_size( n ) == _external_ref_count[n] ) // only POs
+    auto it = fanout_info.begin(); // walks the per-level fanout records of `n`
+    if ( _ntk.fanout_size( n ) == 1 ) /* single fanout */
     {
-      c.output_interfaces.push_back( { n, n } ); // PO interface
+      assert( fanout_info.size() == 1 );
+      if ( it->fanouts.size() == 1 ) /* single gate fanout */
+      {
+        if ( it->relative_depth == 1 ) // directly adjacent: the fanout joins the chunk
+          recruit( it->fanouts.front(), c );
+        else // otherwise it is an output interface of the chunk
+          c.output_interfaces.push_back( { n, it->fanouts.front() } );
+      }
+      else /* single PO fanout */
+      {
+        assert( it->extrefs.size() == 1 );
+        c.po_interfaces.push_back( { n, it->extrefs.front() } );
+      }
+      return;
     }
-    else if ( fanout_info.size() == 1 ) // single gate fanout
+
+    for ( ; it != fanout_info.end(); ++it ) // multiple fanouts: first register every PO reference as a PO interface
     {
-      auto const& no = fanout_info.front().fanouts.front();
-      if ( is_fixed( no ) )
-        c.output_interfaces.push_back( { n, no } );
-      else if ( fanout_info.front().relative_depth == 1 )
-        recruit( no, c );
-      else
-        c.output_interfaces.push_back( { n, no } );
+      for ( auto it2 = it->extrefs.begin(); it2 != it->extrefs.end(); ++it2 )
+        c.po_interfaces.push_back( { n, *it2 } );
     }
-    else
+    it = fanout_info.begin(); // rewind for the gate-fanout pass
+
+    if ( _ps.assume.ci_capacity > 1 && _ntk.is_pi( n ) ) // PI with fanout capacity > 1
     {
-      for ( auto it = fanout_info.begin(); it != fanout_info.end(); ++it )
+      if ( it->relative_depth == 1 )
+      {
+        for ( auto it2 = it->fanouts.begin(); it2 != it->fanouts.end(); ++it2 )
+          recruit( *it2, c );
+        ++it; // may now be end() when the PI has a single fanout level
+      }
+      if ( it != fanout_info.end() && it->relative_depth == 2 && fanout_info.front().num_edges == _ps.assume.ci_capacity )
+      {
+        assert( fanout_info.front().relative_depth == 1 );
+        for ( auto it2 = it->fanouts.begin(); it2 != it->fanouts.end(); ++it2 )
+          recruit( *it2, c );
+        ++it;
+      }
+      for ( ; it != fanout_info.end(); ++it ) // remaining levels: fanouts outside the chunk become output interfaces
       {
         for ( auto it2 = it->fanouts.begin(); it2 != it->fanouts.end(); ++it2 )
         {
-          if ( is_fixed( *it2 ) )
-            c.output_interfaces.push_back( { n, *it2 } );
-          else if ( it->relative_depth == 2 )
-            recruit( *it2, c );
-          else if ( _ntk.visited( *it2 ) != c.id )
+          if ( _ntk.visited( *it2 ) != c.id )
             c.output_interfaces.push_back( { n, *it2 } );
         }
       }
+      return;
+    }
+
+    for ( ; it != fanout_info.end(); ++it ) // general case
+    {
+      for ( auto it2 = it->fanouts.begin(); it2 != it->fanouts.end(); ++it2 )
+      {
+        if ( it->relative_depth == 2 ) // fanouts at relative depth 2 join the chunk
+          recruit( *it2, c );
+        else if ( _ntk.visited( *it2 ) != c.id ) // farther fanouts not in the chunk are output interfaces
+          c.output_interfaces.push_back( { n, *it2 } );
+      }
     }
   }
 
   bool are_close( node const& ni, node const& n )
   {
     auto const& fanout_info = _fanouts[ni];
+
+    if ( _ps.assume.ci_capacity > 1 && _ntk.is_pi( ni ) )
+    {
+      auto const& front_fanouts = fanout_info.front().fanouts;
+      if ( fanout_info.front().relative_depth == 1 )
+      {
+        if ( std::find( front_fanouts.begin(), front_fanouts.end(), n ) != front_fanouts.end() )
+          return true;
+        if ( fanout_info.front().num_edges < _ps.assume.ci_capacity )
+          return false;
+      }
+      else if ( _ntk.fanout_size( ni ) <= _ps.assume.ci_capacity )
+        return false;
+      assert( fanout_info.size() > 1 );
+    }
+
     if ( fanout_info.size() == 1 && fanout_info.front().relative_depth == 1 )
     {
       assert( fanout_info.front().fanouts.front() == n );
@@ -1629,7 +1595,7 @@ class buffer_insertion
     }
     for ( int i = 0; i < c.output_interfaces.size(); ++i )
     {
-      if ( _ntk.visited( c.output_interfaces[i].o ) == c.id && c.output_interfaces[i].o != c.output_interfaces[i].c )
+      if ( _ntk.visited( c.output_interfaces[i].o ) == c.id )
       {
         c.output_interfaces.erase( c.output_interfaces.begin() + i );
         --i;
@@ -1639,6 +1605,9 @@ class buffer_insertion
 
   bool analyze_chunk_down( chunk c )
   {
+    count_buffers();
+    auto buffers_before = num_buffers();
+
     std::set<node> marked_oi;
     for ( auto oi : c.output_interfaces )
     {
@@ -1659,25 +1628,69 @@ class buffer_insertion
         break;
       }
       c.slack = std::min( c.slack, int32_t( rd - lowest ) );
-      if ( c.slack == rd - lowest )
-        mark_occupied( ii.o, lowest );                                                          // TODO: may be inaccurate
-      if ( _fanouts[ii.o].back().relative_depth == rd && _fanouts[ii.o].back().num_edges == 1 ) // is the only highest fanout
+      pseudo_move( ii.o, ii.c, rd, lowest );
+      if ( _fanouts[ii.o].back().relative_depth == rd && _fanouts[ii.o].back().num_edges == 0 ) // `ii.c` is the last highest fanout of `ii.o`
       {
         ++c.benefits;
       }
     }
 
+    if ( c.po_interfaces.size() > 0 )
+    {
+      if ( !_ps.assume.balance_cios && c.slack >= _ps.assume.num_phases )
+      {
+        c.slack -= c.slack % _ps.assume.num_phases;
+      }
+      else
+      {
+        for ( auto poi : c.po_interfaces )
+        {
+          if ( marked_oi.find( poi.c ) == marked_oi.end() )
+            --c.benefits;
+        }
+      }
+    }
+
+    std::vector<node> pi_members;
     for ( auto m : c.members )
-      c.slack = std::min( c.slack, int32_t( _ntk.is_pi( m ) ? _levels[m] : _levels[m] - 1 ) );
+    {
+      if ( _ntk.is_pi( m ) )
+      {
+        pi_members.emplace_back( m );
+        c.slack = std::min( c.slack, int32_t( _levels[m] ) );
+      }
+    }
+    if ( pi_members.size() > 0 )
+    {
+      while ( c.slack > 0 )
+      {
+        bool ok = true;
+        for ( auto m : pi_members )
+        {
+          if ( _levels[m] < c.slack || !is_acceptable_ci_lvl( _levels[m] - c.slack ) )
+          {
+            ok = false;
+            break;
+          }
+        }
+        if ( !ok )
+          --c.slack;
+        else
+          break;
+      }
+    }
 
     if ( c.benefits > 0 && c.slack > 0 )
     {
-      count_buffers();
       bool legal = true;
-      auto buffers_before = num_buffers();
 
       for ( auto m : c.members )
         _levels[m] -= c.slack;
+      if ( !_ps.assume.balance_cios && c.slack >= _ps.assume.num_phases )
+      {
+        for ( auto poi : c.po_interfaces )
+          _po_levels[poi.o] -= c.slack;
+      }
       for ( auto m : c.members )
         update_fanout_info( m );
       for ( auto ii : c.input_interfaces )
@@ -1691,6 +1704,11 @@ class buffer_insertion
         /* UNDO */
         for ( auto m : c.members )
           _levels[m] += c.slack;
+        if ( !_ps.assume.balance_cios && c.slack >= _ps.assume.num_phases )
+        {
+          for ( auto poi : c.po_interfaces )
+            _po_levels[poi.o] += c.slack;
+        }
         for ( auto m : c.members )
           update_fanout_info( m );
         for ( auto ii : c.input_interfaces )
@@ -1704,9 +1722,10 @@ class buffer_insertion
     }
     else
     {
-      /* reset fanout_infos of input_interfaces because num_edges may be modified by mark_occupied */
+      /* reset fanout_infos of input_interfaces because num_edges may be modified by pseudo_move */
       for ( auto ii : c.input_interfaces )
         update_fanout_info( ii.o );
+      _outdated = true;
       return false;
     }
   }
@@ -1716,33 +1735,80 @@ class buffer_insertion
   {
     auto const& fanout_info = _fanouts[n];
     assert( fanout_info.size() );
-    assert( _ntk.fanout_size( n ) != _external_ref_count[n] );
-    if ( fanout_info.size() == 1 )
+
+    auto it = fanout_info.begin();
+    uint32_t rd_prev = 1;
+    uint32_t num_splitters_prev = 1;
+    if ( _ntk.is_pi( n ) && _ps.assume.ci_capacity > 1 )
+    {
+      if ( it->num_edges <= _ps.assume.ci_capacity )
+        return 1;
+      else
+        num_splitters_prev = _ps.assume.ci_capacity - it->fanouts.size() - it->extrefs.size();
+    }
+    else if ( fanout_info.size() == 1 ) // single fanout
     {
-      assert( fanout_info.front().fanouts.size() == 1 );
       return 1;
     }
-    auto it = fanout_info.begin();
-    ++it;
-    while ( it != fanout_info.end() && it->num_edges == _ps.assume.splitter_capacity )
-      ++it;
-    if ( it == fanout_info.end() ) // full fanout tree
-      return fanout_info.back().relative_depth + 1;
-    --it; // the last full layer
-    return it->relative_depth + 1;
+    
+    ++it; // skip the first splitter at rd=1
+    for ( ; it != fanout_info.end(); ++it )
+    {
+      if ( it->relative_depth > rd_prev + 1 ) // level skip => cannot be full
+      {
+        return rd_prev + 1;
+      }
+      else if ( it->num_edges == _ps.assume.splitter_capacity * num_splitters_prev ) // full layer
+      {
+        num_splitters_prev = it->num_edges - it->fanouts.size() - it->extrefs.size();
+        rd_prev = it->relative_depth;
+      }
+      else
+      {
+        return it->relative_depth;
+      }
+    }
+    // all full
+    return fanout_info.back().relative_depth + 1;
   }
 
-  void mark_occupied( node const& n, uint32_t rd )
+  /* Edit `_fanouts[n]` so that `no`, a fanout of `n`, is recorded at relative depth `to_rd` instead of `from_rd`; requires `from_rd > to_rd`. */
+  void pseudo_move( node const& n, node const& no, uint32_t from_rd, uint32_t to_rd )
   {
+    assert( from_rd > to_rd );
     auto& fanout_info = _fanouts[n];
-    for ( auto it = fanout_info.begin(); it != fanout_info.end(); ++it )
+    auto it = fanout_info.begin();
+    for ( ; it != fanout_info.end(); ++it ) // step 1: register `no` at `to_rd`
     {
-      if ( it->relative_depth == rd )
+      if ( it->relative_depth == to_rd ) // a record for this level already exists
       {
         ++it->num_edges;
-        return;
+        it->fanouts.push_back( no );
+        break;
+      }
+      else if ( it->relative_depth > to_rd ) // no record at `to_rd`: insert one before the first higher level
+      {
+        fanout_info.insert( it, {to_rd, {no}, {}, 2} ); // relative_depth = to_rd, fanouts = {no}, no extrefs, num_edges = 2
+        break;
+      }
+    }
+    for ( ; it != fanout_info.end(); ++it ) // step 2: continue from the same position and unregister `no` at `from_rd`
+    {
+      if ( it->relative_depth == from_rd )
+      {
+        --it->num_edges;
+        for ( auto it2 = it->fanouts.begin(); it2 != it->fanouts.end(); ++it2 )
+        {
+          if ( *it2 == no )
+          {
+            it->fanouts.erase( it2 );
+            return;
+          }
+        }
+        assert( false ); // `no` was not listed at `from_rd`
       }
     }
+    assert( false ); // no record with relative depth `from_rd` was found
   }
 
   bool analyze_chunk_up( chunk c )
@@ -1762,14 +1828,66 @@ class buffer_insertion
         ++c.benefits;
       }
       auto const& fanout_info = _fanouts[oi.c];
-      if ( _ntk.fanout_size( oi.c ) == _external_ref_count[oi.c] ) // only POs
-        c.slack = std::min( c.slack, int32_t( _depth - _levels[oi.c] - num_splitter_levels( oi.c ) ) );
-      else if ( fanout_info.size() == 1 ) // single fanout
+      if ( fanout_info.size() == 1 ) /* single fanout */
         c.slack = std::min( c.slack, int32_t( fanout_info.front().relative_depth - 1 ) );
       else
         c.slack = std::min( c.slack, int32_t( _levels[oi.o] - _levels[oi.c] - 2 ) );
     }
 
+    std::vector<uint32_t> po_to_move;
+    if ( c.po_interfaces.size() > 0 )
+    {
+      for ( auto poi : c.po_interfaces )
+      {
+        if ( _levels[poi.c] + num_splitter_levels( poi.c ) + c.slack >= _po_levels[poi.o] )
+        {
+          if ( _ps.assume.balance_cios )
+            c.slack = std::min( c.slack, int32_t( _po_levels[poi.o] - _levels[poi.c] - num_splitter_levels( poi.c ) - 1 ) );
+          else
+          {
+            c.slack = std::min( c.slack, int32_t( _depth + 1 - _po_levels[poi.o] ) );
+            po_to_move.emplace_back( poi.o );
+          }
+        }
+        else
+        {
+          if ( marked_oi.find( poi.c ) == marked_oi.end() )
+            ++c.benefits;
+        }
+      }
+    }
+
+    if ( c.benefits <= 0 || c.slack <= 0 )
+      return false;
+
+    std::vector<node> pi_members;
+    for ( auto m : c.members )
+    {
+      if ( _ntk.is_pi( m ) )
+        pi_members.emplace_back( m );
+    }
+    if ( pi_members.size() > 0 )
+    {
+      while ( c.slack > 0 )
+      {
+        bool ok = true;
+        for ( auto m : pi_members )
+        {
+          if ( !is_acceptable_ci_lvl( _levels[m] + c.slack ) )
+          {
+            ok = false;
+            break;
+          }
+        }
+        if ( !ok )
+          --c.slack;
+        else
+          break;
+      }
+    }
+    if ( po_to_move.size() > 0 )
+      c.slack -= c.slack % _ps.assume.num_phases;
+
     if ( c.benefits > 0 && c.slack > 0 )
     {
       count_buffers();
@@ -1778,17 +1896,12 @@ class buffer_insertion
 
       for ( auto m : c.members )
         _levels[m] += c.slack;
+      for ( auto po : po_to_move )
+        _po_levels[po] += c.slack;
       for ( auto m : c.members )
-      {
         legal &= update_fanout_info<true>( m );
-        if ( !legal )
-          break;
-      }
-      if ( legal )
-      {
-        for ( auto ii : c.input_interfaces )
-          update_fanout_info( ii.o );
-      }
+      for ( auto ii : c.input_interfaces )
+        legal &= update_fanout_info<true>( ii.o );
 
       _outdated = true;
       if ( legal )
@@ -1798,6 +1911,8 @@ class buffer_insertion
         /* UNDO */
         for ( auto m : c.members )
           _levels[m] -= c.slack;
+        for ( auto po : po_to_move )
+          _po_levels[po] -= c.slack;
         for ( auto m : c.members )
           update_fanout_info( m );
         for ( auto ii : c.input_interfaces )
@@ -1814,53 +1929,11 @@ class buffer_insertion
       return false;
     }
   }
-
-  void adjust_depth()
-  {
-    if ( !_ps.assume.balance_pis )
-    {
-      auto min_level = std::numeric_limits<uint32_t>::max();
-      if ( _ps.assume.branch_pis )
-      {
-        _ntk.foreach_pi( [&]( auto n ) {
-          min_level = std::min( min_level, _levels[n] );
-        } );
-
-        if ( min_level != 0 )
-        {
-          _ntk.foreach_node( [&]( auto n ) {
-            if ( !_ntk.is_constant( n ) )
-              _levels[n] -= min_level;
-          } );
-        }
-      }
-      else
-      {
-        _ntk.foreach_gate( [&]( auto n ) {
-          min_level = std::min( min_level, _levels[n] );
-        } );
-
-        if ( min_level > 1 )
-        {
-          _ntk.foreach_gate( [&]( auto n ) {
-            _levels[n] -= min_level - 1;
-          } );
-        }
-      }
-    }
-
-    _depth = 0;
-    _ntk.foreach_po( [&]( auto f ) {
-      _depth = std::max( _depth, _levels[_ntk.get_node( f )] + num_splitter_levels( _ntk.get_node( f ) ) );
-    } );
-
-    _outdated = true;
-  }
 #pragma endregion
 
 #pragma region Global optimal by SMT
 private:
-#include "optimal_buffer_insertion.hpp"
+//#include "optimal_buffer_insertion.hpp"
 #pragma endregion
 
 private:
@@ -1868,6 +1941,7 @@ class buffer_insertion
   {
     uint32_t relative_depth{ 0u };
     std::list<node> fanouts;
+    std::list<uint32_t> extrefs; // IDs of POs (as in `_ntk.foreach_po`)
     uint32_t num_edges{ 0u };
   };
   using fanouts_by_level = std::list<fanout_information>;
@@ -1877,25 +1951,25 @@ class buffer_insertion
   bool _outdated{ true };
   bool _is_scheduled_ASAP{ true };
 
+  /* The following data structures uniquely define the state (i.e. schedule) of the algorithm/flow.
+     The rest (`_fanouts` and `_num_buffers`) are computed from these by calling `count_buffers()`. */
   node_map<uint32_t, Ntk> _levels;
-  node_map<std::pair<uint32_t, uint32_t>, Ntk> _timeframes;
+  std::vector<uint32_t> _po_levels; // imaginary node, must be at `num_phases * k + 1`
   uint32_t _depth{ 0u };
 
   /* Guarantees on `_fanouts` (when not `_outdated`):
-   * - If not `branch_pis`: `_fanouts[PI]` is empty.
-   * - PO ref count is added to `num_edges` of the last element.
+   * - Sum of `_fanouts[n][l].fanouts.size() + _fanouts[n][l].extrefs.size()` over all `l`s
+   *   should be equal to `ntk.fanout_size( n )`.
    * - If having only one fanout: `_fanouts[n].size() == 1`.
    * - If having multiple fanouts: `_fanouts[n]` must have at least two elements,
    *   and the first element must have `relative_depth == 1` and `num_edges == 1`.
+   * - If `ci_capacity > 1`, `_fanouts[PI].size()` may be 1.
    */
   node_map<fanouts_by_level, Ntk> _fanouts;
-  node_map<uint32_t, Ntk> _external_ref_count;     // total refs
-  node_map<uint32_t, Ntk> _external_ref_count_neg; // negated refs
   node_map<uint32_t, Ntk> _num_buffers;
-  node_map<uint32_t, Ntk> _min_level;
-  node_map<uint32_t, Ntk> _max_level;
 
+  node_map<std::pair<uint32_t, uint32_t>, Ntk> _timeframes; // only for SMT; the most extreme min/max
   uint32_t _start_id; // for chunked movement
-};                    /* buffer_insertion */
+}; /* buffer_insertion */
 
-} // namespace mockturtle
+} // namespace mockturtle
\ No newline at end of file
diff --git a/include/mockturtle/algorithms/aqfp/buffer_verification.hpp b/include/mockturtle/algorithms/aqfp/buffer_verification.hpp
index 3e9bcb1cd..ca899c88c 100644
--- a/include/mockturtle/algorithms/aqfp/buffer_verification.hpp
+++ b/include/mockturtle/algorithms/aqfp/buffer_verification.hpp
@@ -61,6 +61,7 @@ uint32_t recompute_level( Ntk& ntk, typename Ntk::node const& n )
 {
   if ( ntk.visited( n ) == ntk.trav_id() )
     return ntk.level( n );
+  ntk.set_visited( n, ntk.trav_id() );
 
   uint32_t max_fi_level{ 0u };
   ntk.foreach_fanin( n, [&]( auto const& fi ) {
@@ -72,76 +73,31 @@ uint32_t recompute_level( Ntk& ntk, typename Ntk::node const& n )
 
 } // namespace detail
 
-/*! \brief Find a reasonable level assignment for a buffered network.
+/*! \brief Find a reasonable level assignment for a buffered network given PI levels.
  *
  * \param ntk Buffered network
- * \param ps AQFP constraints
+ * \param pi_levels Levels of PIs
  * \return Level assignment to all nodes
  */
 template<class Ntk>
-node_map<uint32_t, Ntk> schedule_buffered_network( Ntk const& ntk, aqfp_assumptions const& ps )
+node_map<uint32_t, Ntk> schedule_buffered_network_with_PI_levels( Ntk const& ntk, std::vector<uint32_t> const& pi_levels )
 {
+  assert( pi_levels.size() == ntk.num_pis() );
+
   using node = typename Ntk::node;
   node_map<uint32_t, Ntk> levels( ntk );
   depth_view dv{ ntk };
 
-  /* PIs are balanced : simple ASAP
-     POs are balanced : ALAP == ASAP and then lift all POs' TFI cone
-     neither : start from higher PO's TFI cone */
-  if ( !ps.balance_pis )
-  {
-    ntk.incr_trav_id();
-    ntk.set_visited( ntk.get_node( ntk.get_constant( false ) ), ntk.trav_id() );
-    ntk.foreach_pi( [&]( auto const& n ) {
-      ntk.set_visited( n, ntk.trav_id() );
-    } );
-
-    if ( ps.balance_pos )
-    {
-      ntk.foreach_po( [&]( auto const& f ) {
-        detail::schedule_fanin_cone( dv, ntk.get_node( f ), dv.depth() );
-      } );
-    }
-    else
-    {
-      std::list<node> pos;
-      ntk.foreach_po( [&]( auto const& f ) {
-        pos.push_back( ntk.get_node( f ) );
-      } );
+  ntk.incr_trav_id();
+  ntk.set_visited( ntk.get_node( ntk.get_constant( false ) ), ntk.trav_id() );
+  ntk.foreach_pi( [&]( auto const& n, auto i ) {
+    ntk.set_visited( n, ntk.trav_id() );
+    dv.set_level( n, pi_levels[i] );
+  } );
 
-      while ( pos.size() > 0 )
-      {
-        /* choose the highest unscheduled PO */
-        node n = pos.front();
-        uint32_t max_level = dv.level( n );
-        for ( auto it = pos.begin(); it != pos.end(); ++it )
-        {
-          if ( dv.level( *it ) > max_level )
-          {
-            n = *it;
-            max_level = dv.level( n );
-          }
-        }
-
-        detail::schedule_fanin_cone( dv, n, max_level );
-
-        for ( auto it = pos.begin(); it != pos.end(); )
-        {
-          /* remove all visited POs (there may be lower POs in the TFI of the processed PO) */
-          if ( ntk.visited( *it ) == ntk.trav_id() )
-          {
-            it = pos.erase( it );
-          }
-          /* recompute levels because some of their TFI may have been lifted */
-          else
-          {
-            detail::recompute_level( dv, *it );
-            ++it;
-          }
-        }
-      }
-    }
-  }
+  ntk.foreach_po( [&]( auto const& f ){
+    detail::recompute_level( dv, ntk.get_node( f ) );
+  });
 
   ntk.foreach_node( [&]( auto const& n ) {
     levels[n] = dv.level( n );
@@ -153,12 +109,12 @@ node_map<uint32_t, Ntk> schedule_buffered_network( Ntk const& ntk, aqfp_assumpti
 /*! \brief Verify a buffered network according to AQFP assumptions with provided level assignment.
  *
  * \param ntk Buffered network
- * \param ps AQFP constraints
+ * \param ps AQFP assumptions
  * \param levels Level assignment for all nodes
  * \return Whether `ntk` is path-balanced and properly-branched
  */
 template<class Ntk>
-bool verify_aqfp_buffer( Ntk const& ntk, aqfp_assumptions const& ps, node_map<uint32_t, Ntk> const& levels )
+bool verify_aqfp_buffer( Ntk const& ntk, aqfp_assumptions_legacy const& ps, node_map<uint32_t, Ntk> const& levels )
 {
   static_assert( is_buffered_network_type_v<Ntk>, "Ntk is not a buffered network" );
   static_assert( has_is_buf_v<Ntk>, "Ntk does not implement the is_buf method" );
@@ -214,16 +170,122 @@ bool verify_aqfp_buffer( Ntk const& ntk, aqfp_assumptions const& ps, node_map<ui
   return legal;
 }
 
-/*! \brief Verify a buffered network according to AQFP assumptions.
+/*! \brief Verify a buffered network according to AQFP assumptions with provided level assignment.
  *
  * \param ntk Buffered network
- * \param ps AQFP constraints
+ * \param ps AQFP assumptions
+ * \param levels Level assignment for all nodes
  * \return Whether `ntk` is path-balanced and properly-branched
  */
 template<class Ntk>
-bool verify_aqfp_buffer( Ntk const& ntk, aqfp_assumptions const& ps )
+bool verify_aqfp_buffer( Ntk const& ntk, aqfp_assumptions_realistic const& ps, node_map<uint32_t, Ntk> const& levels )
+{
+  static_assert( is_buffered_network_type_v<Ntk>, "Ntk is not a buffered network" );
+  static_assert( has_is_buf_v<Ntk>, "Ntk does not implement the is_buf method" );
+  bool legal = true;
+
+  /* fanout branching: PIs, buffers/splitters and logic gates each have their own fanout limit */
+  ntk.foreach_node( [&]( auto const& n ) {
+    if ( ntk.is_constant( n ) )
+      return;
+    if ( ntk.is_pi( n ) )
+    {
+      legal &= ( ntk.fanout_size( n ) <= ps.ci_capacity );
+    }
+    else if ( ntk.is_buf( n ) )
+    {
+      legal &= ( ntk.fanout_size( n ) <= ps.splitter_capacity );
+    }
+    else /* logic gate */
+    {
+      legal &= ( ntk.fanout_size( n ) <= 1 );
+    }
+    assert( legal );
+  } );
+
+  /* path balancing: every non-constant fanin must sit exactly one level below its fanout */
+  ntk.foreach_node( [&]( auto const& n ) {
+    ntk.foreach_fanin( n, [&]( auto const& fi ) {
+      auto ni = ntk.get_node( fi );
+      if ( !ntk.is_constant( ni ) )
+        legal &= ( levels[ni] == levels[n] - 1 );
+      assert( legal );
+    } );
+  } );
+
+  if ( ps.balance_cios ) /* PIs exactly at one of `ci_phases`; all POs at one common, phase-aligned level */
+  {
+    auto const check_pi_fn = [&]( uint32_t level ){
+      for ( auto const& p : ps.ci_phases )
+      {
+        if ( level == p )
+          return true;
+      }
+      return false;
+    };
+
+    ntk.foreach_pi( [&]( auto const& n ) {
+      legal &= check_pi_fn( levels[n] );
+      assert( legal );
+    } );
+
+    uint32_t depth{ 0u };
+    bool depth_set{ false }; /* level 0 is a valid PO level, so `depth == 0` cannot mean "unset" */
+    ntk.foreach_po( [&]( auto const& f ) {
+      auto n = ntk.get_node( f );
+      if ( ntk.is_constant( n ) )
+        return;
+      if ( !depth_set )
+        depth = levels[n]; /* the first non-constant PO fixes the reference level */
+      depth_set = true;
+      legal &= ( levels[n] == depth );
+      assert( legal );
+    } );
+    legal &= ( depth % ps.num_phases == 0 );
+    assert( legal );
+  }
+  else /* PIs may be delayed by whole multiples of `num_phases`; each PO only has to be phase-aligned */
+  {
+    auto const check_pi_fn = [&]( uint32_t level ){
+      for ( auto const& p : ps.ci_phases )
+      {
+        if ( level >= p && ( level - p ) % ps.num_phases == 0 )
+          return true;
+      }
+      return false;
+    };
+
+    ntk.foreach_pi( [&]( auto const& n ) {
+      legal &= check_pi_fn( levels[n] );
+      assert( legal );
+    } );
+
+    ntk.foreach_po( [&]( auto const& f ) {
+      auto n = ntk.get_node( f );
+      if ( !ntk.is_constant( n ) )
+      {
+        legal &= ( levels[n] % ps.num_phases == 0 );
+        assert( legal );
+      }
+    } );
+  }
+
+  // TODO: max_phase_skip
+
+  return legal;
+}
+
+/*! \brief Verify a buffered network according to AQFP assumptions with provided PI level assignment.
+ *
+ * \param ntk Buffered network
+ * \param ps AQFP assumptions
+ * \param pi_levels Levels of PIs
+ * \return Whether `ntk` is path-balanced, phase-aligned, and properly-branched
+ */
+template<class Ntk, typename Asmp = aqfp_assumptions>
+bool verify_aqfp_buffer( Ntk const& ntk, Asmp const& ps, std::vector<uint32_t> const& pi_levels )
 {
-  auto const levels = schedule_buffered_network( ntk, ps );
+  auto const levels = schedule_buffered_network_with_PI_levels( ntk, pi_levels );
   return verify_aqfp_buffer( ntk, ps, levels );
 }
 
diff --git a/include/mockturtle/algorithms/aqfp/optimal_buffer_insertion.hpp b/include/mockturtle/algorithms/aqfp/optimal_buffer_insertion.hpp
index b2cefa969..859f681fb 100644
--- a/include/mockturtle/algorithms/aqfp/optimal_buffer_insertion.hpp
+++ b/include/mockturtle/algorithms/aqfp/optimal_buffer_insertion.hpp
@@ -33,6 +33,91 @@
 // NOTE: This file is included inside the class `mockturtle::buffer_insertion`
 // It should not be included anywhere else.
 
+#pragma region Compute timeframe for SMT solving
+  /*! \brief Compute the earliest (`.first`) and latest (`.second`, w.r.t. `max_depth`) possible timeframe by eager ASAP and ALAP; returns the ASAP depth */
+  uint32_t compute_timeframe( uint32_t max_depth )
+  {
+    // TODO: Consider max_depth % _ps.assume.num_phases == 0 constraint
+    _timeframes.reset( std::make_pair( 0, 0 ) );
+    uint32_t min_depth{ 0 };
+
+    _ntk.incr_trav_id(); // fresh traversal ID for the ASAP pass
+    _ntk.foreach_po( [&]( auto const& f ) {
+      auto const no = _ntk.get_node( f );
+      auto clevel = compute_levels_ASAP_eager( no ) + ( _ntk.fanout_size( no ) > 1 ? 1 : 0 ); // one extra level if the PO driver has several fanouts
+      min_depth = std::max( min_depth, clevel );
+    } );
+
+    _ntk.incr_trav_id(); // fresh traversal ID for the ALAP pass
+    _ntk.foreach_po( [&]( auto const& f ) {
+      const auto n = _ntk.get_node( f );
+      if ( !_ntk.is_constant( n ) && _ntk.visited( n ) != _ntk.trav_id() ) // skip drivers already reached from another PO
+      {
+        _timeframes[n].second = max_depth - ( _ntk.fanout_size( n ) > 1 ? 1 : 0 );
+        compute_levels_ALAP_eager( n );
+      }
+    } );
+
+    return min_depth;
+  }
+
+  uint32_t compute_levels_ASAP_eager( node const& n ) // returns the ASAP level of `n` and stores it in `_timeframes[n].first`
+  {
+    if ( _ntk.visited( n ) == _ntk.trav_id() ) // already computed in the current traversal
+    {
+      return _timeframes[n].first;
+    }
+    _ntk.set_visited( n, _ntk.trav_id() );
+
+    if ( _ntk.is_constant( n ) )
+    {
+      return _timeframes[n].first = 0;
+    }
+    if ( _ntk.is_pi( n ) )
+    {
+      return _timeframes[n].first = _ps.assume.ci_phases[0]; // PIs get the first entry of `ci_phases`
+    }
+
+    uint32_t level{ 0 };
+    _ntk.foreach_fanin( n, [&]( auto const& fi ) {
+      auto const ni = _ntk.get_node( fi );
+      if ( !_ntk.is_constant( ni ) )
+      {
+        level = std::max( level, compute_levels_ASAP_eager( ni ) + ( _ntk.fanout_size( ni ) > 1 ? 1 : 0 ) ); // one extra level if the fanin has several fanouts
+      }
+    } );
+
+    return _timeframes[n].first = level + 1;
+  }
+
+  void compute_levels_ALAP_eager( node const& n ) // propagates `_timeframes[n].second` to the fanins of `n`, recursively
+  {
+    _ntk.set_visited( n, _ntk.trav_id() );
+
+    _ntk.foreach_fanin( n, [&]( auto const& fi ) {
+      auto const ni = _ntk.get_node( fi );
+      if ( !_ntk.is_constant( ni ) )
+      {
+        if ( _ps.assume.balance_cios && _ntk.is_pi( ni ) ) // with balanced CIOs, PIs are pinned to the first entry of `ci_phases`
+        {
+          assert( _timeframes[n].second > _ps.assume.ci_phases[0] );
+          _timeframes[ni].second = _ps.assume.ci_phases[0];
+        }
+        else
+        {
+          assert( _timeframes[n].second > num_splitter_levels( ni ) );
+          auto fi_level = _timeframes[n].second - ( _ntk.fanout_size( ni ) > 1 ? 2 : 1 ); // one level below `n`, or two if the fanin has several fanouts
+          if ( _ntk.visited( ni ) != _ntk.trav_id() || _timeframes[ni].second > fi_level ) // first visit, or a lower bound than the stored one
+          {
+            _timeframes[ni].second = fi_level;
+            compute_levels_ALAP_eager( ni );
+          }
+        }
+      }
+    } );
+  }
+#pragma endregion
+
 #if __GNUC__ == 7
 
 void optimize_with_smt( std::string name = "" )
diff --git a/include/mockturtle/io/write_dot.hpp b/include/mockturtle/io/write_dot.hpp
index 284db76a1..edbed2914 100644
--- a/include/mockturtle/io/write_dot.hpp
+++ b/include/mockturtle/io/write_dot.hpp
@@ -75,6 +75,13 @@ class default_dot_drawer
     }
     else
     {
+      if constexpr ( has_is_buf_v<Ntk> )
+      {
+        if ( ntk.is_buf( n ) )
+        {
+          return "box";
+        }
+      }
       return "ellipse";
     }
   }
@@ -98,6 +105,16 @@ class default_dot_drawer
 
   virtual std::string node_fillcolor( Ntk const& ntk, node<Ntk> const& n ) const
   {
+    if constexpr ( has_is_buf_v<Ntk> )
+    {
+      if ( ntk.is_buf( n ) )
+      {
+        if ( ntk.fanout_size( n ) > 1 )
+          return "lightcoral";
+        else
+          return "lightskyblue";
+      }
+    }
     return ( ntk.is_constant( n ) || ntk.is_ci( n ) ) ? "snow2" : "white";
   }
 
@@ -113,6 +130,11 @@ class default_dot_drawer
     (void)ntk;
     (void)n;
     (void)f;
+    if constexpr ( is_buffered_network_type_v<Ntk> )
+    {
+      if ( ntk.is_constant( ntk.get_node( f ) ) )
+        return false;
+    }
     return true;
   }
 
diff --git a/include/mockturtle/mockturtle.hpp b/include/mockturtle/mockturtle.hpp
index 3142be58b..030bfdc40 100644
--- a/include/mockturtle/mockturtle.hpp
+++ b/include/mockturtle/mockturtle.hpp
@@ -173,6 +173,7 @@
 #include "mockturtle/networks/tig.hpp"
 #include "mockturtle/networks/xag.hpp"
 #include "mockturtle/networks/xmg.hpp"
+#include "mockturtle/networks/crossed.hpp"
 #include "mockturtle/properties/aqfpcost.hpp"
 #include "mockturtle/properties/mccost.hpp"
 #include "mockturtle/properties/migcost.hpp"
@@ -214,3 +215,4 @@
 #include "mockturtle/views/names_view.hpp"
 #include "mockturtle/views/topo_view.hpp"
 #include "mockturtle/views/window_view.hpp"
+#include "mockturtle/views/rank_view.hpp"
diff --git a/include/mockturtle/networks/buffered.hpp b/include/mockturtle/networks/buffered.hpp
index 628e6d9f5..eef214f76 100644
--- a/include/mockturtle/networks/buffered.hpp
+++ b/include/mockturtle/networks/buffered.hpp
@@ -345,9 +345,93 @@ class buffered_mig_network : public mig_network
 
 #pragma region Restructuring
   // disable restructuring
-  std::optional<std::pair<node, signal>> replace_in_node( node const& n, node const& old_node, signal new_signal ) = delete;
-  void replace_in_outputs( node const& old_node, signal const& new_signal ) = delete;
-  void take_out_node( node const& n ) = delete;
+  void replace_in_node( node const& n, node const& old_node, signal new_signal ) // rewires the fanin of `n` that points to `old_node` onto `new_signal`
+  {
+    assert( is_buf( old_node ) ); // only a buffer may be the replaced fanin
+    auto& node = _storage->nodes[n];
+
+    if ( is_buf( n ) ) // buffer: update both stored children and return
+    {
+      assert( node.children[0].index == old_node );
+      new_signal.complement ^= node.children[0].weight; // fold the old edge's `weight` bit into the new signal
+      node.children[0] = new_signal;
+      node.children[1] = !new_signal; // second child holds the complemented signal
+      _storage->nodes[new_signal.index].data[0].h1++; // one more reference to the new fanin
+      return;
+    }
+
+    uint32_t fanin = 3u; // 3 means "not found"; valid positions are 0..2
+    for ( auto i = 0u; i < 3u; ++i )
+    {
+      if ( node.children[i].index == old_node )
+      {
+        fanin = i;
+        new_signal.complement ^= node.children[i].weight; // fold the old edge's `weight` bit into the new signal
+        break;
+      }
+    }
+    assert( fanin < 3 );
+    signal child2 = new_signal;
+    signal child1 = node.children[( fanin + 1 ) % 3];
+    signal child0 = node.children[( fanin + 2 ) % 3];
+    if ( child0.index > child1.index ) // the three compare-and-swap steps order the children by ascending index
+    {
+      std::swap( child0, child1 );
+    }
+    if ( child1.index > child2.index )
+    {
+      std::swap( child1, child2 );
+    }
+    if ( child0.index > child1.index )
+    {
+      std::swap( child0, child1 );
+    }
+
+    _storage->hash.erase( node ); // drop the hash entry before the children change, re-insert afterwards
+    node.children[0] = child0;
+    node.children[1] = child1;
+    node.children[2] = child2;
+    _storage->hash[node] = n;
+
+    // update the reference counter of the new signal (the counter of `old_node` is not touched in this function)
+    _storage->nodes[new_signal.index].data[0].h1++;
+  }
+  void replace_in_outputs( node const& old_node, signal const& new_signal ) // redirects every output that points to `old_node` onto `new_signal`
+  {
+    assert( !is_dead( old_node ) );
+
+    for ( auto& output : _storage->outputs )
+    {
+      if ( output.index == old_node )
+      {
+        output.index = new_signal.index;
+        output.weight ^= new_signal.complement; // combine the output's existing `weight` bit with the new signal's complement
+
+        if ( old_node != new_signal.index )
+        {
+          // increment the reference counter (`h1`) of the new driver; the counter of `old_node` is not touched here
+          _storage->nodes[new_signal.index].data[0].h1++;
+        }
+      }
+    }
+  }
+  void take_out_node( node const& n ) // marks buffer `n` dead, then recurses into its fanin if that fanin's count reaches 0
+  {
+    assert( is_buf( n ) ); // NOTE(review): the recursive call below asserts this for the fanin as well - confirm callers never strand a gate or PI
+
+    auto& nobj = _storage->nodes[n];
+    nobj.data[0].h1 = UINT32_C( 0x80000000 ); /* fanout size 0, but dead */
+
+    for ( auto const& fn : _events->on_delete ) // invoke every registered `on_delete` callback with `n`
+    {
+      ( *fn )( n );
+    }
+
+    if ( decr_fanout_size( nobj.children[0].index ) == 0 ) // presumably the remaining fanout count of the fanin - verify against `decr_fanout_size`
+    {
+      take_out_node( nobj.children[0].index );
+    }
+  }
   void substitute_node( node const& old_node, signal const& new_signal ) = delete;
   void substitute_nodes( std::list<std::pair<node, signal>> substitutions ) = delete;
 #pragma endregion
diff --git a/test/algorithms/aqfp/aqfp_retiming.cpp b/test/algorithms/aqfp/aqfp_retiming.cpp
index e789438d2..28db8efe1 100644
--- a/test/algorithms/aqfp/aqfp_retiming.cpp
+++ b/test/algorithms/aqfp/aqfp_retiming.cpp
@@ -107,7 +107,7 @@ TEST_CASE( "aqfp retiming", "[aqfp_retiming]" )
   asp.balance_pos = true;
 
   buffer_insertion_params ps;
-  ps.assume = asp;
+  ps.assume = legacy_to_realistic( asp );
   ps.scheduling = buffer_insertion_params::ASAP;
   ps.optimization_effort = buffer_insertion_params::none;
 
@@ -123,5 +123,8 @@ TEST_CASE( "aqfp retiming", "[aqfp_retiming]" )
 
   CHECK( rst.buffers_pre == 57 );
   CHECK( rst.buffers_post == 49 );
-  CHECK( verify_aqfp_buffer( aqfp_ret, asp ) == true );
+  std::vector<uint32_t> pi_levels;
+  for ( auto i = 0u; i < aqfp_ret.num_pis(); ++i )
+    pi_levels.emplace_back( 0 );
+  CHECK( verify_aqfp_buffer( aqfp_ret, asp, pi_levels ) == true );
 }
diff --git a/test/algorithms/aqfp/buffer_insertion.cpp b/test/algorithms/aqfp/buffer_insertion.cpp
index 6a3333857..c12e4fd89 100644
--- a/test/algorithms/aqfp/buffer_insertion.cpp
+++ b/test/algorithms/aqfp/buffer_insertion.cpp
@@ -11,217 +11,6 @@
 
 using namespace mockturtle;
 
-TEST_CASE( "buffer_insertion simple test", "[buffer_insertion]" )
-{
-  mig_network mig;
-  auto const a = mig.create_pi();
-  auto const b = mig.create_pi();
-  auto const c = mig.create_pi();
-  auto const d = mig.create_pi();
-  auto const e = mig.create_pi();
-
-  auto const f1 = mig.create_maj( a, b, c );
-  auto const f2 = mig.create_maj( d, e, f1 );
-  auto const f3 = mig.create_maj( a, d, f1 );
-  auto const f4 = mig.create_maj( f1, f2, f3 );
-  mig.create_po( f4 );
-
-  buffer_insertion_params ps;
-  ps.assume.branch_pis = false;
-  ps.assume.balance_pis = false;
-  ps.assume.balance_pos = true;
-  ps.assume.splitter_capacity = 4u;
-  ps.scheduling = buffer_insertion_params::ASAP;
-  ps.optimization_effort = buffer_insertion_params::none;
-
-  buffer_insertion buffering( mig, ps );
-  node_map<uint32_t, mig_network> levels{ mig };
-  CHECK( buffering.dry_run( &levels ) == 2u );
-
-  CHECK( levels[f1] == 1u );
-  CHECK( levels[f2] == 3u );
-  CHECK( levels[f3] == 3u );
-  CHECK( levels[f4] == 4u );
-  CHECK( buffering.depth() == 4u );
-  CHECK( buffering.num_buffers( mig.get_node( f1 ) ) == 2u );
-  CHECK( buffering.num_buffers( mig.get_node( f2 ) ) == 0u );
-  CHECK( buffering.num_buffers( mig.get_node( f3 ) ) == 0u );
-  CHECK( buffering.num_buffers( mig.get_node( f4 ) ) == 0u );
-}
-
-TEST_CASE( "two layers of splitters", "[buffer_insertion]" )
-{
-  mig_network mig;
-  auto const a = mig.create_pi();
-  auto const b = mig.create_pi();
-  auto const c = mig.create_pi();
-  auto const d = mig.create_pi();
-  auto const e = mig.create_pi();
-  auto const f = mig.create_pi();
-  auto const g = mig.create_pi();
-  auto const h = mig.create_pi();
-  auto const i = mig.create_pi();
-  auto const j = mig.create_pi();
-
-  auto const f1 = mig.create_maj( a, b, c );
-  auto const f2 = mig.create_maj( b, c, d );
-  auto const f3 = mig.create_maj( d, e, f );
-  auto const f4 = mig.create_maj( g, h, i );
-  auto const f5 = mig.create_maj( h, i, j );
-
-  auto const f6 = mig.create_maj( f3, f4, f5 );
-  auto const f7 = mig.create_maj( a, f1, f2 );
-  auto const f8 = mig.create_maj( f2, f3, g );
-  auto const f9 = mig.create_maj( f7, f2, f8 );
-  auto const f10 = mig.create_maj( f8, f2, f5 );
-  auto const f11 = mig.create_maj( f2, f8, f6 );
-  auto const f12 = mig.create_maj( f9, f10, f11 );
-  mig.create_po( f12 );
-
-  buffer_insertion_params ps;
-  ps.assume.branch_pis = false;
-  ps.assume.balance_pis = false;
-  ps.assume.balance_pos = true;
-  ps.assume.splitter_capacity = 4u;
-  ps.scheduling = buffer_insertion_params::ASAP;
-  ps.optimization_effort = buffer_insertion_params::none;
-
-  buffer_insertion buffering( mig, ps );
-  CHECK( buffering.dry_run() == 17u );
-
-  CHECK( buffering.num_buffers( mig.get_node( f2 ) ) == 4u );
-  CHECK( buffering.num_buffers( mig.get_node( f6 ) ) == 2u );
-  CHECK( buffering.depth() == 7u );
-}
-
-TEST_CASE( "PO splitters, buffers and inverters", "[buffer_insertion]" )
-{
-  mig_network mig;
-  auto const a = mig.create_pi();
-  auto const b = mig.create_pi();
-  auto const c = mig.create_pi();
-  auto const d = mig.create_pi();
-
-  auto const f1 = mig.create_maj( a, b, c );
-  auto const f2 = mig.create_maj( f1, c, d );
-  mig.create_po( f1 );
-  mig.create_po( !f1 );
-  mig.create_po( f2 );
-  mig.create_po( f2 );
-  mig.create_po( !f2 );
-
-  buffer_insertion_params ps;
-  ps.assume.branch_pis = false;
-  ps.assume.balance_pis = false;
-  ps.assume.balance_pos = true;
-  ps.assume.splitter_capacity = 4u;
-  ps.scheduling = buffer_insertion_params::ASAP;
-  ps.optimization_effort = buffer_insertion_params::none;
-
-  buffer_insertion buffering( mig, ps );
-  CHECK( buffering.dry_run() == 8u );
-
-  CHECK( buffering.depth() == 5u );
-  CHECK( buffering.num_buffers( mig.get_node( f1 ) ) == 5u );
-  CHECK( buffering.num_buffers( mig.get_node( f2 ) ) == 3u );
-
-  buffered_mig_network bufntk;
-  buffering.dump_buffered_network( bufntk );
-  CHECK( verify_aqfp_buffer( bufntk, ps.assume ) == true );
-}
-
-TEST_CASE( "chain of fanouts", "[buffer_insertion]" )
-{
-  mig_network mig;
-  auto const a = mig.create_pi();
-  auto const b = mig.create_pi();
-  auto const c = mig.create_pi();
-  auto const d = mig.create_pi();
-  auto const e = mig.create_pi();
-  auto const f = mig.create_pi();
-  auto const g = mig.create_pi();
-  auto const h = mig.create_pi();
-  auto const i = mig.create_pi();
-
-  auto const f1 = mig.create_maj( a, b, c );
-  auto const f2 = mig.create_maj( f1, c, d );
-  auto const f3 = mig.create_maj( f1, f2, e );
-  auto const f4 = mig.create_maj( f1, f2, f );
-  auto const f5 = mig.create_maj( f1, f3, f4 );
-  auto const f6 = mig.create_maj( f1, f5, f );
-  auto const f7 = mig.create_maj( f1, f2, g );
-  auto const f8 = mig.create_maj( f1, f7, h );
-  auto const f9 = mig.create_maj( f1, f7, i );
-  mig.create_po( f1 );
-  mig.create_po( f1 );
-  mig.create_po( f1 );
-  mig.create_po( f1 );
-  mig.create_po( f1 );
-  mig.create_po( f6 );
-  mig.create_po( f8 );
-  mig.create_po( f9 );
-
-  buffer_insertion_params ps;
-  ps.assume.branch_pis = false;
-  ps.assume.balance_pis = false;
-  ps.assume.balance_pos = true;
-  ps.assume.splitter_capacity = 4u;
-  ps.scheduling = buffer_insertion_params::ASAP;
-  ps.optimization_effort = buffer_insertion_params::none;
-
-  buffer_insertion buffering( mig, ps );
-  CHECK( buffering.dry_run() == 11u );
-
-  CHECK( buffering.num_buffers( mig.get_node( f1 ) ) == 9u );
-  CHECK( buffering.depth() == 8u );
-}
-
-TEST_CASE( "branch but not balance PIs", "[buffer_insertion]" )
-{
-  mig_network mig;
-  auto const a = mig.create_pi();
-  auto const b = mig.create_pi(); // shared
-  auto const c = mig.create_pi(); // shared
-  auto const d = mig.create_pi();
-  auto const e = mig.create_pi(); // shared at higher level
-  auto const f = mig.create_pi(); // connects to two POs
-
-  auto const f1 = mig.create_maj( a, b, c );
-  auto const f2 = mig.create_maj( b, c, d );
-  auto const f3 = mig.create_and( f1, e );
-  auto const f4 = mig.create_and( f2, e );
-  mig.create_po( f3 );
-  mig.create_po( f4 );
-  mig.create_po( f );
-  mig.create_po( f );
-
-  buffer_insertion_params ps;
-  ps.assume.branch_pis = true;
-  ps.assume.balance_pis = false;
-  ps.assume.balance_pos = true;
-  ps.assume.splitter_capacity = 4u;
-  ps.scheduling = buffer_insertion_params::ALAP;
-  ps.optimization_effort = buffer_insertion_params::none;
-
-  buffer_insertion buffering( mig, ps );
-  node_map<uint32_t, mig_network> levels{ mig };
-  CHECK( buffering.dry_run( &levels ) == 4u );
-
-  CHECK( buffering.level( mig.get_node( f1 ) ) == 2u );
-  CHECK( buffering.level( mig.get_node( f2 ) ) == 2u );
-  CHECK( buffering.level( mig.get_node( f3 ) ) == 3u );
-  CHECK( buffering.level( mig.get_node( f4 ) ) == 3u );
-
-  CHECK( buffering.level( mig.get_node( a ) ) == 1u );
-  CHECK( buffering.level( mig.get_node( b ) ) == 0u );
-  CHECK( buffering.level( mig.get_node( c ) ) == 0u );
-  CHECK( buffering.level( mig.get_node( d ) ) == 1u );
-  CHECK( buffering.level( mig.get_node( e ) ) == 1u );
-  CHECK( buffering.level( mig.get_node( f ) ) == 2u );
-
-  CHECK( buffering.depth() == 3u );
-}
-
 TEST_CASE( "various assumptions", "[buffer_insertion]" )
 {
   aig_network aig;
@@ -245,135 +34,56 @@ TEST_CASE( "various assumptions", "[buffer_insertion]" )
   aig.create_po( f3 );
   aig.create_po( f4 );
 
-  aqfp_assumptions asp;
+  aqfp_assumptions_realistic asp;
   asp.splitter_capacity = 2u;
+  asp.num_phases = 1u;
+  asp.ci_phases = {0};
 
   buffer_insertion_params ps;
   ps.scheduling = buffer_insertion_params::ASAP;
   ps.optimization_effort = buffer_insertion_params::none;
 
   /* branch PI, balance PI and PO */
-  asp.branch_pis = true;
-  asp.balance_pis = true;
-  asp.balance_pos = true;
+  asp.ci_capacity = 1;
+  asp.balance_cios = true;
   ps.assume = asp;
   {
     buffer_insertion buffering( aig, ps );
     buffered_aig_network buffered;
     CHECK( buffering.run( buffered ) == 23u );
-    CHECK( verify_aqfp_buffer( buffered, asp ) == true );
-  }
-
-  /* branch PI, balance only PI */
-  asp.branch_pis = true;
-  asp.balance_pis = true;
-  asp.balance_pos = false;
-  ps.assume = asp;
-  {
-    buffer_insertion buffering( aig, ps );
-    buffered_aig_network buffered;
-    CHECK( buffering.run( buffered ) == 11u );
-    CHECK( verify_aqfp_buffer( buffered, asp ) == true );
-  }
-
-  /* branch PI, balance only PO */
-  asp.branch_pis = true;
-  asp.balance_pis = false;
-  asp.balance_pos = true;
-  ps.assume = asp;
-  {
-    ps.scheduling = buffer_insertion_params::ASAP;
-    buffer_insertion buffering1( aig, ps );
-    buffered_aig_network buffered1;
-    CHECK( buffering1.run( buffered1 ) == 23u );
-    CHECK( verify_aqfp_buffer( buffered1, asp ) == true );
-
-    ps.scheduling = buffer_insertion_params::ALAP;
-    buffer_insertion buffering2( aig, ps );
-    buffered_aig_network buffered2;
-    CHECK( buffering2.run( buffered2 ) == 11u );
-    CHECK( verify_aqfp_buffer( buffered2, asp ) == true );
-
-    ps.scheduling = buffer_insertion_params::ASAP_depth;
-    buffer_insertion buffering3( aig, ps );
-    buffered_aig_network buffered3;
-    CHECK( buffering3.run( buffered3 ) == 17u );
-    CHECK( verify_aqfp_buffer( buffered3, asp ) == true );
-
-    ps.scheduling = buffer_insertion_params::ALAP_depth;
-    buffer_insertion buffering4( aig, ps );
-    buffered_aig_network buffered4;
-    CHECK( buffering4.run( buffered4 ) == 10u );
-    CHECK( verify_aqfp_buffer( buffered4, asp ) == true );
+    CHECK( verify_aqfp_buffer( buffered, asp, buffering.pi_levels() ) == true );
   }
 
   /* branch PI, balance neither */
-  asp.branch_pis = true;
-  asp.balance_pis = false;
-  asp.balance_pos = false;
+  asp.ci_capacity = 1;
+  asp.balance_cios = false;
   ps.assume = asp;
   {
     ps.scheduling = buffer_insertion_params::ASAP;
     buffer_insertion buffering1( aig, ps );
     buffered_aig_network buffered1;
-    CHECK( buffering1.run( buffered1 ) == 11u );
-    CHECK( verify_aqfp_buffer( buffered1, asp ) == true );
+    buffering1.run( buffered1 );
+    CHECK( verify_aqfp_buffer( buffered1, asp, buffering1.pi_levels() ) == true );
 
     ps.scheduling = buffer_insertion_params::ALAP;
     buffer_insertion buffering2( aig, ps );
     buffered_aig_network buffered2;
-    CHECK( buffering2.run( buffered2 ) == 9u );
-    CHECK( verify_aqfp_buffer( buffered2, asp ) == true );
+    buffering2.run( buffered2 );
+    CHECK( verify_aqfp_buffer( buffered2, asp, buffering2.pi_levels() ) == true );
 
     ps.scheduling = buffer_insertion_params::ASAP_depth;
     buffer_insertion buffering3( aig, ps );
     buffered_aig_network buffered3;
-    CHECK( buffering3.run( buffered3 ) == 8u );
-    CHECK( verify_aqfp_buffer( buffered3, asp ) == true );
+    buffering3.run( buffered3 );
+    CHECK( buffering3.depth() == 4 );
+    CHECK( verify_aqfp_buffer( buffered3, asp, buffering3.pi_levels() ) == true );
 
     ps.scheduling = buffer_insertion_params::ALAP_depth;
     buffer_insertion buffering4( aig, ps );
     buffered_aig_network buffered4;
-    CHECK( buffering4.run( buffered4 ) == 8u );
-    CHECK( verify_aqfp_buffer( buffered4, asp ) == true );
-  }
-
-  /* don't branch PI, balance PO */
-  asp.branch_pis = false;
-  asp.balance_pis = false;
-  asp.balance_pos = true;
-  ps.assume = asp;
-  {
-    ps.scheduling = buffer_insertion_params::ASAP;
-    buffer_insertion buffering1( aig, ps );
-    buffered_aig_network buffered1;
-    CHECK( buffering1.run( buffered1 ) == 5u );
-    CHECK( verify_aqfp_buffer( buffered1, asp ) == true );
-
-    ps.scheduling = buffer_insertion_params::ASAP_depth;
-    buffer_insertion buffering2( aig, ps );
-    buffered_aig_network buffered2;
-    CHECK( buffering2.run( buffered2 ) == 5u );
-    CHECK( verify_aqfp_buffer( buffered2, asp ) == true );
-  }
-
-  /* don't branch PI, balance neither */
-  asp.branch_pis = false;
-  asp.balance_pis = false;
-  asp.balance_pos = false;
-  ps.assume = asp;
-  {
-    ps.scheduling = buffer_insertion_params::ASAP;
-    buffer_insertion buffering1( aig, ps );
-    buffered_aig_network buffered1;
-    CHECK( buffering1.run( buffered1 ) == 2u );
-    CHECK( verify_aqfp_buffer( buffered1, asp ) == true );
-
-    ps.scheduling = buffer_insertion_params::ASAP_depth;
-    buffer_insertion buffering2( aig, ps );
-    buffered_aig_network buffered2;
-    CHECK( buffering2.run( buffered2 ) == 2u );
-    CHECK( verify_aqfp_buffer( buffered2, asp ) == true );
+    buffering4.run( buffered4 );
+    CHECK( buffering4.depth() == 4 );
+    CHECK( verify_aqfp_buffer( buffered4, asp, buffering4.pi_levels() ) == true );
   }
 }
 
@@ -395,7 +105,7 @@ TEST_CASE( "optimization with chunked movement", "[buffer_insertion]" )
   auto const num_buf_asap = buffering.num_buffers();
   auto const num_buf_opt = buffering.run( buffered_ntk );
 
-  CHECK( verify_aqfp_buffer( buffered_ntk, ps.assume ) == true );
+  CHECK( verify_aqfp_buffer( buffered_ntk, ps.assume, buffering.pi_levels() ) == true );
   CHECK( num_buf_opt < num_buf_asap );
 }
 #endif