Implementing runtime parameter binding (Qiskit#1901)
Optimizes GPU simulation of a single circuit with multiple parameters by binding the parameters to each gate at runtime, so the single circuit is run as a multi-shot simulation instead of being expanded into one circuit per parameter set.
This feature is enabled by the new option ``runtime_parameter_bind_enable=True`` (default: ``False``).
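
A minimal sketch of how the option is meant to be used (assumes a qiskit-aer build that includes this commit; the circuit and parameter values are illustrative):

```python
import numpy as np
from qiskit import QuantumCircuit, transpile
from qiskit.circuit import Parameter
from qiskit_aer import AerSimulator

theta = Parameter("theta")
qc = QuantumCircuit(2)
qc.rx(theta, 0)
qc.cx(0, 1)
qc.measure_all()

backend = AerSimulator(method="statevector")
backend.set_options(runtime_parameter_bind_enable=True)

# One parameterized circuit, many parameter values: with the option enabled the
# values are bound gate-by-gate at runtime instead of expanding into N circuits.
values = list(np.linspace(0, np.pi, 8))
result = backend.run(
    transpile(qc, backend),
    shots=1024,
    parameter_binds=[{theta: values}],
).result()
print(result.get_counts())  # one counts dict per bound parameter value
```

With the option disabled, the same ``parameter_binds`` call still works, but a separate circuit is constructed and simulated for each value.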

* Implementing runtime parameter binding
* remove old files
* fix seg fault caused by global phase for parameters
* delete duplicate max_matrix_qubits
* Correct metadata for runtime param bind configs and move time_taken to metadata so that we can read time info from primitives
* performance improvement of sampling measure for runtime parameter binding
* fix error for MPI
* Improve batched sampling measure
* format
* fix OpenMP nested parallel
* reflecting review comments
* fix lint
* fix lint
doichanj authored Oct 6, 2023
1 parent 13370ba commit 67b8dcf
Showing 43 changed files with 4,389 additions and 823 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -512,6 +512,7 @@ endif()

if(AER_DEBUG)
set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} AER_DEBUG)
set(AER_COMPILER_FLAGS "${AER_COMPILER_FLAGS} -g")
endif()

if(TEST_JSON)
1 change: 1 addition & 0 deletions qiskit_aer/backends/aer_compiler.py
@@ -491,6 +491,7 @@ def compile_circuit(circuits, basis_gates=None, optypes=None):
"parameterizations": (list),
"fusion_parallelization_threshold": (int, np.integer),
"target_gpus": (list),
"runtime_parameter_bind_enable": (bool, np.bool_),
}


8 changes: 8 additions & 0 deletions qiskit_aer/backends/aer_simulator.py
@@ -318,6 +318,12 @@ class AerSimulator(AerBackend):
* ``accept_distributed_results`` (bool): This option enables storing
results independently in each process (Default: None).
* ``runtime_parameter_bind_enable`` (bool): If this option is True,
parameters are bound at runtime via multi-shot simulation, without
constructing a separate circuit for each parameter set. On GPU, this option
can be combined with ``batched_shots_gpu`` to run multiple parameter sets
in a batch (Default: False).
These backend options only apply when using the ``"statevector"``
simulation method:
@@ -765,6 +771,8 @@ def _default_options(cls):
# tensor network options
tensor_network_num_sampling_qubits=10,
use_cuTensorNet_autotuning=False,
# parameter binding
runtime_parameter_bind_enable=False,
)

def __repr__(self):
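
The docstring above pairs the new option with ``batched_shots_gpu``; a hedged sketch of that GPU combination (assumes a CUDA-enabled qiskit-aer build with at least one visible GPU; circuit and values are illustrative):

```python
from qiskit import QuantumCircuit, transpile
from qiskit.circuit import Parameter
from qiskit_aer import AerSimulator

phi = Parameter("phi")
qc = QuantumCircuit(3)
qc.ry(phi, 0)
qc.cx(0, 1)
qc.cx(1, 2)
qc.measure_all()

backend = AerSimulator(
    method="statevector",
    device="GPU",
    runtime_parameter_bind_enable=True,  # bind parameters at runtime
    batched_shots_gpu=True,              # batch the parameter sets / shots on the GPU
)
binds = [{phi: [0.1 * k for k in range(32)]}]
counts = backend.run(
    transpile(qc, backend), shots=1000, parameter_binds=binds
).result().get_counts()
```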
14 changes: 12 additions & 2 deletions qiskit_aer/backends/wrappers/aer_controller_binding.hpp
@@ -412,6 +412,14 @@ void bind_aer_controller(MODULE m) {
"target_gpus",
[](const Config &config) { return config.target_gpus.val; },
[](Config &config, reg_t val) { config.target_gpus.value(val); });
aer_config.def_property(
"runtime_parameter_bind_enable",
[](const Config &config) {
return config.runtime_parameter_bind_enable.val;
},
[](Config &config, bool val) {
config.runtime_parameter_bind_enable.value(val);
});

aer_config.def(py::pickle(
[](const AER::Config &config) {
@@ -500,11 +508,12 @@ void bind_aer_controller(MODULE m) {
79, config.extended_stabilizer_norm_estimation_default_samples),
write_value(80, config.shot_branching_enable),
write_value(81, config.shot_branching_sampling_enable),
write_value(82, config.target_gpus));
write_value(82, config.target_gpus),
write_value(83, config.runtime_parameter_bind_enable));
},
[](py::tuple t) {
AER::Config config;
if (t.size() != 82)
if (t.size() != 84)
throw std::runtime_error("Invalid serialization format.");

read_value(t, 0, config.shots);
Expand Down Expand Up @@ -594,6 +603,7 @@ void bind_aer_controller(MODULE m) {
read_value(t, 80, config.shot_branching_enable);
read_value(t, 81, config.shot_branching_sampling_enable);
read_value(t, 82, config.target_gpus);
read_value(t, 83, config.runtime_parameter_bind_enable);
return config;
}));
}
30 changes: 30 additions & 0 deletions releasenotes/notes/add_executor-ba4870f86ed5d8ec.yaml
@@ -0,0 +1,30 @@
---
features:
  - |
    This release restructures the ``State`` classes.
    Circuit executor classes are added that run a circuit and manage multiple
    states for multi-shot simulations or for multi-chunk simulations of large
    numbers of qubits.
    Previously the ``StateChunk`` class managed multiple chunks for multi-shot
    or multi-chunk simulations, but now each ``State`` class holds only one
    state and all the parallelization code has moved to the ``Executor``
    classes, so the ``State`` classes are independent of parallelization.
    Some functions of the ``Aer::Controller`` class have also moved to the
    ``CircuitExecutor::Executor`` class.
  - |
    A shot-branching technique that accelerates simulation of dynamic circuits
    is implemented on top of the restructured ``Executor`` classes.
    Shot-branching is currently applicable to the statevector, density_matrix
    and tensor_network methods.
    It distributes multi-shot simulations dynamically by branching states when
    dynamic operations (measure, reset, initialize, noise) are applied.
    ``shot_branching_enable`` is disabled by default.
    Setting ``shot_branching_sampling_enable`` performs the final measurements
    with sampling measure, which speeds up obtaining counts for multiple shots
    that share the same state.
  - |
    A new option for GPU simulation, ``target_gpus``, is added.
    This option takes a list of the GPUs to be used for the simulation; without
    it, all available GPUs are used.
    For example, on a system with 4 GPUs, ``target_gpus=[0, 2]`` will use 2 of
    them.
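
A combined usage sketch of the options described in these notes (``shot_branching_enable``, ``shot_branching_sampling_enable`` and ``target_gpus``). It assumes a CUDA-enabled build with at least three GPUs for the ``target_gpus=[0, 2]`` part; drop the GPU-related options to try shot branching on CPU:

```python
from qiskit import QuantumCircuit, transpile
from qiskit_aer import AerSimulator

# A small dynamic circuit: mid-circuit measurement followed by a conditional X.
qc = QuantumCircuit(2, 2)
qc.h(0)
qc.measure(0, 0)
qc.x(1).c_if(0, 1)
qc.measure(1, 1)

backend = AerSimulator(
    method="statevector",
    device="GPU",
    target_gpus=[0, 2],                   # run only on GPUs 0 and 2
    shot_branching_enable=True,           # branch states at measure/reset/etc.
    shot_branching_sampling_enable=True,  # sample final measurements per branch
)
counts = backend.run(transpile(qc, backend), shots=4096).result().get_counts()
```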
@@ -0,0 +1,8 @@
---
features:
  - |
    A runtime parameter binding option is implemented that binds parameters to
    a single circuit at runtime, instead of running multiple bound circuits as
    input. The option ``runtime_parameter_bind_enable=True`` enables this
    feature and, for GPU simulation, ``batched_shots_gpu=True`` should also be
    set to speed up simulating parameterized circuits.
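
A sketch contrasting the pre-existing path (one bound circuit per parameter set) with the new runtime-binding path (one parameterized circuit plus ``parameter_binds``); circuit, parameter names and values are illustrative:

```python
import numpy as np
from qiskit import QuantumCircuit, transpile
from qiskit.circuit import Parameter
from qiskit_aer import AerSimulator

theta = Parameter("theta")
template = QuantumCircuit(1)
template.rx(theta, 0)
template.measure_all()
values = np.linspace(0, 2 * np.pi, 100)

# Without the option: one bound circuit per parameter value is constructed and
# each is simulated as a separate experiment.
backend = AerSimulator()
circuits = [template.assign_parameters({theta: v}) for v in values]
result_classic = backend.run(transpile(circuits, backend), shots=512).result()

# With the option: a single parameterized circuit is submitted and the values
# are bound at runtime inside the simulator.
backend_rt = AerSimulator(runtime_parameter_bind_enable=True)
result_runtime = backend_rt.run(
    transpile(template, backend_rt),
    shots=512,
    parameter_binds=[{theta: list(values)}],
).result()
```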
80 changes: 52 additions & 28 deletions src/controllers/aer_controller.hpp
@@ -194,6 +194,9 @@ class Controller {
int myrank_ = 0;
int num_processes_ = 1;
int num_process_per_experiment_ = 1;

// runtime parameter binding
bool runtime_parameter_bind_ = false;
};

//=========================================================================
@@ -329,6 +332,10 @@ void Controller::set_config(const Config &config) {
throw std::runtime_error(std::string("Invalid simulation precision (") +
precision + std::string(")."));
}

// check if runtime binding is enabled
if (config.runtime_parameter_bind_enable.has_value())
runtime_parameter_bind_ = config.runtime_parameter_bind_enable.value();
}

void Controller::clear_config() {
@@ -502,7 +509,14 @@ Result Controller::execute(std::vector<std::shared_ptr<Circuit>> &circuits,
auto methods = simulation_methods(config, circuits, noise_model);

// Initialize Result object for the given number of experiments
Result result(circuits.size());
uint_t result_size;
reg_t result_offset(circuits.size());
result_size = 0;
for (int_t i = 0; i < circuits.size(); i++) {
result_offset[i] = result_size;
result_size += circuits[i]->num_bind_params;
}
Result result(result_size);
// Initialize circuit executors for each circuit
std::vector<std::shared_ptr<CircuitExecutor::Base>> executors(
circuits.size());
@@ -514,12 +528,15 @@ Result Controller::execute(std::vector<std::shared_ptr<Circuit>> &circuits,

// set parallelization for experiments
try {
uint_t res_pos = 0;
for (int i = 0; i < circuits.size(); i++) {
executors[i] = make_circuit_executor(methods[i]);
required_memory_mb_list[i] =
executors[i]->required_memory_mb(config, *circuits[i], noise_model);
result.results[i].metadata.add(required_memory_mb_list[i],
"required_memory_mb");
for (int j = 0; j < circuits[i]->num_bind_params; j++) {
result.results[res_pos++].metadata.add(required_memory_mb_list[i],
"required_memory_mb");
}
}
set_parallelization_experiments(required_memory_mb_list);
} catch (std::exception &e) {
@@ -565,41 +582,48 @@ Result Controller::execute(std::vector<std::shared_ptr<Circuit>> &circuits,
// average random seed to set the same seed to each process (when
// seed_simulator is not set)
if (num_processes_ > 1) {
reg_t seeds(circuits.size());
reg_t avg_seeds(circuits.size());
for (int_t i = 0; i < circuits.size(); i++)
seeds[i] = circuits[i]->seed;
MPI_Allreduce(seeds.data(), avg_seeds.data(), circuits.size(),
MPI_UINT64_T, MPI_SUM, MPI_COMM_WORLD);
for (int_t i = 0; i < circuits.size(); i++)
circuits[i]->seed = avg_seeds[i] / num_processes_;
}
#endif

const int NUM_RESULTS = result.results.size();
// following looks very similar but we have to separate them to avoid omp
// nested loops that causes performance degradation (DO NOT use if statement
// in #pragma omp)
if (parallel_experiments_ == 1) {
for (int i = 0; i < NUM_RESULTS; i++) {
executors[i]->run_circuit(*circuits[i], noise_model, config, methods[i],
sim_device_, result.results[i]);
reg_t seeds(result_size);
reg_t avg_seeds(result_size);
int_t iseed = 0;
for (int_t i = 0; i < circuits.size(); i++) {
if (circuits[i]->num_bind_params > 1) {
for (int_t j = 0; j < circuits[i]->num_bind_params; j++)
seeds[iseed++] = circuits[i]->seed_for_params[j];
} else
seeds[iseed++] = circuits[i]->seed;
}
} else {
#pragma omp parallel for num_threads(parallel_experiments_)
for (int i = 0; i < NUM_RESULTS; i++) {
executors[i]->run_circuit(*circuits[i], noise_model, config, methods[i],
sim_device_, result.results[i]);
MPI_Allreduce(seeds.data(), avg_seeds.data(), result_size, MPI_UINT64_T,
MPI_SUM, MPI_COMM_WORLD);
iseed = 0;
for (int_t i = 0; i < circuits.size(); i++) {
if (circuits[i]->num_bind_params > 1) {
for (int_t j = 0; j < circuits[i]->num_bind_params; j++)
circuits[i]->seed_for_params[j] =
avg_seeds[iseed++] / num_processes_;
} else
circuits[i]->seed = avg_seeds[iseed++] / num_processes_;
}
}
#endif

auto run_circuits = [this, &executors, &circuits, &noise_model, &config,
&methods, &result, &result_offset](int_t i) {
executors[i]->run_circuit(*circuits[i], noise_model, config, methods[i],
sim_device_,
result.results.begin() + result_offset[i]);
};
Utils::apply_omp_parallel_for((parallel_experiments_ > 1), 0,
circuits.size(), run_circuits,
parallel_experiments_);

executors.clear();

// Check each experiment result for completed status.
// If only some experiments completed return partial completed status.

bool all_failed = true;
result.status = Result::Status::completed;
for (int i = 0; i < NUM_RESULTS; ++i) {
for (int i = 0; i < result.results.size(); ++i) {
auto &experiment = result.results[i];
if (experiment.status == ExperimentResult::Status::completed) {
all_failed = false;
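The controller changes above allocate one experiment result per bound parameter set; a sketch of what that looks like from the Python side (assumes the option is enabled; names and values are illustrative):

```python
from qiskit import QuantumCircuit, transpile
from qiskit.circuit import Parameter
from qiskit_aer import AerSimulator

theta = Parameter("theta")
qc = QuantumCircuit(1)
qc.rx(theta, 0)
qc.measure_all()

backend = AerSimulator(runtime_parameter_bind_enable=True)
result = backend.run(
    transpile(qc, backend), shots=512, parameter_binds=[{theta: [0.1, 0.2, 0.3]}]
).result()

assert len(result.results) == 3  # one experiment result per bound value
print(result.get_counts())       # list of three counts dicts
# Per the commit message, each entry's metadata also carries timing info
# (time_taken), so primitives can read per-binding execution time.
```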