[WIP] Add new multithreaded TwoQubitPeepholeOptimization pass

This commit adds a new transpiler pass for physical optimization, TwoQubitPeepholeOptimization. This replaces the use of Collect2qBlocks, ConsolidateBlocks, and UnitarySynthesis in the optimization stage for a default pass manager setup. The pass logically works the same way: it analyzes the DAG to get a list of 2q runs, calculates the matrix of each run, and then synthesizes the matrix and substitutes it in place. The distinction this pass makes, though, is that it does this all in a single pass and also parallelizes the matrix calculation and synthesis steps, because there is no data dependency there. This new pass is not meant to fully replace the Collect2qBlocks, ConsolidateBlocks, or UnitarySynthesis passes, as those also run in contexts where we don't have a physical circuit. This is meant instead to replace their usage in the optimization stage only. Accordingly, this new pass also changes the logic on how we select the synthesis to use and when to make a substitution. Previously this logic was primarily done via the ConsolidateBlocks pass by only consolidating to a UnitaryGate if the number of basis gates needed based on the Weyl chamber coordinates was less than the number of 2q gates in the block (see #11659 for discussion on this). Since this new pass skips the explicit consolidation stage, we go ahead and try all the available synthesizers. Right now this commit has a number of limitations, the largest are: - Doesn't support builds with the py-cache feature (`OnceCell` for the cache can't be used across threads) - Only supports the target - It doesn't support any synthesizers besides the TwoQubitBasisDecomposer, because it's the only one in Rust currently. For plugin handling I left the logic as running the three pass series, but I'm not sure this is the behavior we want. We could say keep the synthesis plugins for `UnitarySynthesis` only and then rely on our built-in methods for physical optimization only.
But this also seems less than ideal because the plugin mechanism is how we support synthesizing to custom basis gates, and also more advanced approximate synthesis methods. Both of those are things we need to do as part of the synthesis here. Additionally, this is currently missing tests and documentation, and while running it manually "works", as in it returns a circuit that looks valid, I've not done any validation yet. This also likely will need several rounds of performance optimization and tuning. At this point this is just a rough proof of concept and will need a lot of refinement along with larger changes to Qiskit's Rust code before this is ready to merge. Fixes #12007 Fixes #11659
Qiskit · Sep 18, 2024 · bd43c51 · bd43c51
1 parent c68e803
commit bd43c51
Show file tree

Hide file tree

Showing 12 changed files with 543 additions and 29 deletions.
diff --git a/crates/accelerate/Cargo.toml b/crates/accelerate/Cargo.toml
@@ -9,6 +9,10 @@ license.workspace = true
 name = "qiskit_accelerate"
 doctest = false
 
+
+[features]
+cache_pygates = ["qiskit-circuit/cache_pygates"]
+
 [dependencies]
 rayon.workspace = true
 numpy.workspace = true

diff --git a/crates/accelerate/src/convert_2q_block_matrix.rs b/crates/accelerate/src/convert_2q_block_matrix.rs
@@ -73,28 +73,33 @@ pub fn blocks_to_matrix(
     for bit in index_map {
         bit_map.add(py, bit.bind(py), true)?;
     }
+    let matrix = compose_2q_matrix(op_list.iter().map(|node| {
+        let matrix = get_matrix_from_inst(py, &node.instruction)?;
+        let bit_indices = bit_map
+            .map_bits(node.instruction.qubits.bind(py).iter())?
+            .map(|x| x as u8)
+            .collect::<SmallVec<_>>();
+        Ok((matrix, bit_indices))
+    }))?;
+    Ok(matrix.into_pyarray_bound(py).unbind())
+}
+
+pub fn compose_2q_matrix<'a>(
+    mut matrices: impl Iterator<Item = PyResult<(Array2<Complex64>, SmallVec<[u8; 2]>)>> + 'a,
+) -> PyResult<Array2<Complex64>> {
     let identity = aview2(&ONE_QUBIT_IDENTITY);
-    let first_node = &op_list[0];
-    let input_matrix = get_matrix_from_inst(py, &first_node.instruction)?;
-    let mut matrix: Array2<Complex64> = match bit_map
-        .map_bits(first_node.instruction.qubits.bind(py).iter())?
-        .collect::<Vec<_>>()
-        .as_slice()
-    {
+    let (input_matrix, bit_map) = matrices.next().unwrap()?;
+
+    let mut matrix: Array2<Complex64> = match bit_map.as_slice() {
         [0] => kron(&identity, &input_matrix),
         [1] => kron(&input_matrix, &identity),
         [0, 1] => input_matrix,
         [1, 0] => change_basis(input_matrix.view()),
         [] => Array2::eye(4),
         _ => unreachable!(),
     };
-    for node in op_list.into_iter().skip(1) {
-        let op_matrix = get_matrix_from_inst(py, &node.instruction)?;
-        let q_list = bit_map
-            .map_bits(node.instruction.qubits.bind(py).iter())?
-            .map(|x| x as u8)
-            .collect::<SmallVec<[u8; 2]>>();
-
+    for raw_data in matrices {
+        let (op_matrix, q_list) = raw_data?;
         let result = match q_list.as_slice() {
             [0] => Some(kron(&identity, &op_matrix)),
             [1] => Some(kron(&op_matrix, &identity)),
@@ -107,7 +112,7 @@ pub fn blocks_to_matrix(
             None => op_matrix.dot(&matrix),
         };
     }
-    Ok(matrix.into_pyarray_bound(py).unbind())
+    Ok(matrix)
 }
 
 /// Switches the order of qubits in a two qubit operation.

diff --git a/crates/accelerate/src/euler_one_qubit_decomposer.rs b/crates/accelerate/src/euler_one_qubit_decomposer.rs
@@ -579,7 +579,7 @@ pub fn generate_circuit(
 
 const EULER_BASIS_SIZE: usize = 12;
 
-static EULER_BASES: [&[&str]; EULER_BASIS_SIZE] = [
+pub static EULER_BASES: [&[&str]; EULER_BASIS_SIZE] = [
     &["u3"],
     &["u3", "u2", "u1"],
     &["u"],
@@ -593,7 +593,7 @@ static EULER_BASES: [&[&str]; EULER_BASIS_SIZE] = [
     &["rz", "sx", "x"],
     &["rz", "sx"],
 ];
-static EULER_BASIS_NAMES: [EulerBasis; EULER_BASIS_SIZE] = [
+pub static EULER_BASIS_NAMES: [EulerBasis; EULER_BASIS_SIZE] = [
     EulerBasis::U3,
     EulerBasis::U321,
     EulerBasis::U,

diff --git a/crates/accelerate/src/lib.rs b/crates/accelerate/src/lib.rs
@@ -42,6 +42,7 @@ pub mod stochastic_swap;
 pub mod synthesis;
 pub mod target_transpiler;
 pub mod two_qubit_decompose;
+pub mod two_qubit_peephole;
 pub mod uc_gate;
 pub mod utils;
 pub mod vf2_layout;

diff --git a/crates/accelerate/src/target_transpiler/mod.rs b/crates/accelerate/src/target_transpiler/mod.rs
@@ -31,6 +31,8 @@ use pyo3::{
     types::{PyDict, PyList, PySet, PyTuple},
 };
 
+use ndarray::Array2;
+use num_complex::Complex64;
 use qiskit_circuit::circuit_instruction::OperationFromPython;
 use qiskit_circuit::operations::{Operation, Param};
 use qiskit_circuit::packed_instruction::PackedOperation;
@@ -108,6 +110,12 @@ pub(crate) struct NormalOperation {
     op_object: PyObject,
 }
 
+impl NormalOperation {
+    pub fn matrix(&self) -> Option<Array2<Complex64>> {
+        self.operation.view().matrix(&self.params)
+    }
+}
+
 impl<'py> FromPyObject<'py> for NormalOperation {
     fn extract(ob: &'py PyAny) -> PyResult<Self> {
         let operation: OperationFromPython = ob.extract()?;

diff --git a/crates/accelerate/src/two_qubit_decompose.rs b/crates/accelerate/src/two_qubit_decompose.rs
@@ -1216,9 +1216,9 @@ type TwoQubitSequenceVec = Vec<(Option<StandardGate>, SmallVec<[f64; 3]>, SmallV
 
 #[pyclass(sequence)]
 pub struct TwoQubitGateSequence {
-    gates: TwoQubitSequenceVec,
+    pub gates: TwoQubitSequenceVec,
     #[pyo3(get)]
-    global_phase: f64,
+    pub global_phase: f64,
 }
 
 #[pymethods]
@@ -1287,6 +1287,10 @@ pub struct TwoQubitBasisDecomposer {
     q2r: Array2<Complex64>,
 }
 impl TwoQubitBasisDecomposer {
+    pub fn gate_name(&self) -> &str {
+        self.gate.as_str()
+    }
+
     fn decomp1_inner(
         &self,
         target: &TwoQubitWeylDecomposition,
@@ -1643,11 +1647,11 @@ impl TwoQubitBasisDecomposer {
         Ok(res)
     }
 
-    fn new_inner(
+    pub fn new_inner(
         gate: String,
         gate_matrix: ArrayView2<Complex64>,
         basis_fidelity: f64,
-        euler_basis: &str,
+        euler_basis: EulerBasis,
         pulse_optimize: Option<bool>,
     ) -> PyResult<Self> {
         let ipz: ArrayView2<Complex64> = aview2(&IPZ);
@@ -1755,7 +1759,7 @@ impl TwoQubitBasisDecomposer {
         Ok(TwoQubitBasisDecomposer {
             gate,
             basis_fidelity,
-            euler_basis: EulerBasis::__new__(euler_basis)?,
+            euler_basis,
             pulse_optimize,
             basis_decomposer,
             super_controlled,
@@ -1781,7 +1785,7 @@ impl TwoQubitBasisDecomposer {
         })
     }
 
-    fn call_inner(
+    pub fn call_inner(
         &self,
         unitary: ArrayView2<Complex64>,
         basis_fidelity: Option<f64>,
@@ -1924,7 +1928,7 @@ impl TwoQubitBasisDecomposer {
             gate,
             gate_matrix.as_array(),
             basis_fidelity,
-            euler_basis,
+            EulerBasis::__new__(euler_basis)?,
             pulse_optimize,
         )
     }
@@ -2222,8 +2226,13 @@ fn two_qubit_decompose_up_to_diagonal(
     let (su4, phase) = u4_to_su4(mat_arr);
     let mut real_map = real_trace_transform(su4.view());
     let mapped_su4 = real_map.dot(&su4.view());
-    let decomp =
-        TwoQubitBasisDecomposer::new_inner("cx".to_string(), aview2(&CX_GATE), 1.0, "U", None)?;
+    let decomp = TwoQubitBasisDecomposer::new_inner(
+        "cx".to_string(),
+        aview2(&CX_GATE),
+        1.0,
+        EulerBasis::__new__("U")?,
+        None,
+    )?;
 
     let circ_seq = decomp.call_inner(mapped_su4.view(), None, true, None)?;
     let circ = CircuitData::from_standard_gates(