From cf249b4d385ebe40c9a68444c447c50feedf1d20 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Wed, 15 May 2024 20:03:40 -0400 Subject: [PATCH 01/10] bump version to 0.4.0 --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e3775c1..b28300a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -806,7 +806,7 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "phenolrs" -version = "0.3.0" +version = "0.4.0" dependencies = [ "anyhow", "bytes", diff --git a/Cargo.toml b/Cargo.toml index 2b89370..1f95220 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "phenolrs" -version = "0.3.0" +version = "0.4.0" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html From 8d606a721363334366c576352ac446dcbcda16bf Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Wed, 15 May 2024 20:03:51 -0400 Subject: [PATCH 02/10] add `arango-datasets` as test dep --- pyproject.toml | 1 + temp.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+) create mode 100644 temp.py diff --git a/pyproject.toml b/pyproject.toml index 2dc5dc8..63d188b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ dependencies = [ [project.optional-dependencies] tests = [ "pytest", + "arango-datasets" ] dynamic = ["version"] diff --git a/temp.py b/temp.py new file mode 100644 index 0000000..5a5604d --- /dev/null +++ b/temp.py @@ -0,0 +1,15 @@ +from python.phenolrs.pyg_loader import PygLoader + +res = PygLoader.load_into_pyg_heterodata( + "abide", + { + "vertexCollections": {"Subjects": {"x": "brain_fmri_features"}}, + "edgeCollections": {"medical_affinity_graph": {'a': 'x'}}, + }, + ["http://localhost:8529"], + None, + "root", + "passwd", + batch_size=1000000, + parallelism=10, +) From 0699599fc58a74e445c1fe36887596888c246af9 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Wed, 15 May 2024 20:04:11 -0400 Subject: [PATCH 03/10] new: return two mappings --- python/phenolrs/numpy_loader.py | 20 ++++++++++++-------- python/phenolrs/phenolrs.pyi | 1 + python/phenolrs/pyg_loader.py | 7 ++++--- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/python/phenolrs/numpy_loader.py b/python/phenolrs/numpy_loader.py index e05ad6c..0ca901a 100644 --- a/python/phenolrs/numpy_loader.py +++ b/python/phenolrs/numpy_loader.py @@ -22,6 +22,7 @@ def load_graph_to_numpy( dict[str, dict[str, npt.NDArray[np.float64]]], dict[typing.Tuple[str, str, str], npt.NDArray[np.float64]], dict[str, dict[str, int]], + dict[str, dict[int, str]], dict[str, dict[str, str]], ]: # TODO: replace with pydantic validation @@ -83,18 +84,21 @@ def load_graph_to_numpy( for e_col_name, entries in metagraph["edgeCollections"].items() ] - features_by_col, coo_map, col_to_adb_id_to_ind = graph_to_pyg_format( - { - "database": database, - "vertex_collections": vertex_collections, - "edge_collections": edge_collections, - "configuration": {"database_config": db_config_options}, - } + features_by_col, coo_map, col_to_adb_key_to_ind, col_to_ind_to_adb_key = ( + graph_to_pyg_format( + { + "database": database, + "vertex_collections": vertex_collections, + "edge_collections": edge_collections, + "configuration": {"database_config": db_config_options}, + } + ) ) return ( features_by_col, coo_map, - col_to_adb_id_to_ind, + col_to_adb_key_to_ind, + col_to_ind_to_adb_key, vertex_cols_source_to_output, ) diff --git a/python/phenolrs/phenolrs.pyi b/python/phenolrs/phenolrs.pyi index 3baf9fc..c4ad824 100644 --- a/python/phenolrs/phenolrs.pyi +++ b/python/phenolrs/phenolrs.pyi @@ -7,6 +7,7 @@ def graph_to_pyg_format(request: dict[str, typing.Any]) -> typing.Tuple[ dict[str, dict[str, npt.NDArray[np.float64]]], dict[typing.Tuple[str, str, str], npt.NDArray[np.float64]], dict[str, dict[str, int]], + dict[str, dict[int, str]], ]: ... class PhenolError(Exception): ... diff --git a/python/phenolrs/pyg_loader.py b/python/phenolrs/pyg_loader.py index 3e81d53..7b5b861 100644 --- a/python/phenolrs/pyg_loader.py +++ b/python/phenolrs/pyg_loader.py @@ -98,7 +98,7 @@ def load_into_pyg_heterodata( tls_cert: typing.Any | None = None, parallelism: int | None = None, batch_size: int | None = None, - ) -> tuple[HeteroData, dict[str, dict[str, int]]]: + ) -> tuple[HeteroData, dict[str, dict[str, int]], dict[str, dict[int, str]]]: if "vertexCollections" not in metagraph: raise PhenolError("vertexCollections not found in metagraph") if "edgeCollections" not in metagraph: @@ -112,7 +112,8 @@ def load_into_pyg_heterodata( ( features_by_col, coo_map, - col_to_adb_id_to_ind, + col_to_adb_key_to_ind, + col_to_ind_to_adb_key, vertex_cols_source_to_output, ) = NumpyLoader.load_graph_to_numpy( database, @@ -142,4 +143,4 @@ def load_into_pyg_heterodata( if result.numel() > 0: data[(from_name, edge_col_name, to_name)].edge_index = result - return data, col_to_adb_id_to_ind + return data, col_to_adb_key_to_ind, col_to_ind_to_adb_key From 34b1b527889904fec2ab474b91c8602617e7b3ae Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Wed, 15 May 2024 20:05:26 -0400 Subject: [PATCH 04/10] update tests --- python/tests/test_all.py | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/python/tests/test_all.py b/python/tests/test_all.py index 4b34d6c..276df74 100644 --- a/python/tests/test_all.py +++ b/python/tests/test_all.py @@ -20,10 +20,16 @@ def test_phenol_abide_hetero( password=connection_information["password"], ) - data, col_to_adb_id_to_ind = result + data, col_to_adb_key_to_ind, col_to_ind_to_adb_key = result assert isinstance(data, HeteroData) assert data["Subjects"]["x"].shape == (871, 2000) - assert len(col_to_adb_id_to_ind["Subjects"]) == 871 + assert ( + len(col_to_adb_key_to_ind["Subjects"]) + == len(col_to_ind_to_adb_key["Subjects"]) + == 871 + ) + + assert data[("Subjects", "medical_affinity_graph", "Subjects")]["edge_index"].shape == (2, 606770) # Metagraph variation result = PygLoader.load_into_pyg_heterodata( @@ -39,10 +45,16 @@ def test_phenol_abide_hetero( password=connection_information["password"], ) - data, col_to_adb_id_to_ind = result + data, col_to_adb_key_to_ind, col_to_ind_to_adb_key = result assert isinstance(data, HeteroData) assert data["Subjects"]["x"].shape == (871, 2000) - assert len(col_to_adb_id_to_ind["Subjects"]) == 871 + assert ( + len(col_to_adb_key_to_ind["Subjects"]) + == len(col_to_ind_to_adb_key["Subjects"]) + == 871 + ) + + assert data[("Subjects", "medical_affinity_graph", "Subjects")]["edge_index"].shape == (2, 606770) def test_phenol_abide_numpy( @@ -51,7 +63,8 @@ def test_phenol_abide_numpy( ( features_by_col, coo_map, - col_to_adb_id_to_ind, + col_to_adb_key_to_ind, + col_to_ind_to_adb_key, vertex_cols_source_to_output, ) = NumpyLoader.load_graph_to_numpy( connection_information["dbName"], @@ -65,17 +78,22 @@ def test_phenol_abide_numpy( ) assert features_by_col["Subjects"]["brain_fmri_features"].shape == (871, 2000) - assert coo_map[("medical_affinity_graph", "Subjects", "Subjects")].shape == ( + assert coo_map[("medical_affinity_graph", "Subjects", "Subjects")]["edge_index"].shape == ( 2, 606770, ) - assert len(col_to_adb_id_to_ind["Subjects"]) == 871 + assert ( + len(col_to_adb_key_to_ind["Subjects"]) + == len(col_to_ind_to_adb_key["Subjects"]) + == 871 + ) assert vertex_cols_source_to_output == {"Subjects": {"brain_fmri_features": "x"}} ( features_by_col, coo_map, - col_to_adb_id_to_ind, + col_to_adb_key_to_ind, + col_to_ind_to_adb_key, vertex_cols_source_to_output, ) = NumpyLoader.load_graph_to_numpy( connection_information["dbName"], @@ -90,5 +108,9 @@ def test_phenol_abide_numpy( assert features_by_col["Subjects"]["brain_fmri_features"].shape == (871, 2000) assert len(coo_map) == 0 - assert len(col_to_adb_id_to_ind["Subjects"]) == 871 + assert ( + len(col_to_adb_key_to_ind["Subjects"]) + == len(col_to_ind_to_adb_key["Subjects"]) + == 871 + ) assert vertex_cols_source_to_output == {"Subjects": {"brain_fmri_features": "x"}} From c36335d1a298587249e7f6c10e15c3b22bb52fa1 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Wed, 15 May 2024 20:05:57 -0400 Subject: [PATCH 05/10] new: use `_key` instead of `_id`, and return reverse mapping --- src/graphs.rs | 27 ++++++++++++++------------- src/lib.rs | 20 +++++++++++++++++--- src/load/receive.rs | 6 ++++-- src/output/construct.rs | 16 ++++++++++++++++ 4 files changed, 51 insertions(+), 18 deletions(-) diff --git a/src/graphs.rs b/src/graphs.rs index f1ef508..2640ba3 100644 --- a/src/graphs.rs +++ b/src/graphs.rs @@ -189,19 +189,20 @@ impl Graph { to_id: Vec, _data: Vec, ) -> Result<()> { - // build up the coo representation - let from_col: String = String::from_utf8({ + let (from_col, from_key) = { let s = String::from_utf8(from_id.clone()).expect("_from to be a string"); - let id_split = s.find('/').unwrap(); - (&s[0..id_split]).into() - }) - .unwrap(); - let to_col: String = String::from_utf8({ + let id_split = s.find('/').expect("Invalid format for _from"); + let (col, key) = s.split_at(id_split); + (col.to_string(), key[1..].to_string()) + }; + + let (to_col, to_key) = { let s = String::from_utf8(to_id.clone()).expect("_to to be a string"); - let id_split = s.find('/').unwrap(); - (&s[0..id_split]).into() - }) - .unwrap(); + let id_split = s.find('/').expect("Invalid format for _to"); + let (col, key) = s.split_at(id_split); + (col.to_string(), key[1..].to_string()) + }; + let key_tup = ( String::from_utf8(col_name).unwrap(), from_col.clone(), @@ -223,8 +224,8 @@ impl Graph { .coo_by_from_edge_to .get_mut(&key_tup) .ok_or_else(|| anyhow!("Unable to get COO from to for {:?}", &key_tup))?; - let from_col_id = from_col_keys.get(&String::from_utf8(from_id).unwrap()); - let to_col_id = to_col_keys.get(&String::from_utf8(to_id).unwrap()); + let from_col_id = from_col_keys.get(&from_key); + let to_col_id = to_col_keys.get(&to_key); if let (Some(from_id), Some(to_id)) = (from_col_id, to_col_id) { cur_coo[0].push(*from_id); cur_coo[1].push(*to_id); diff --git a/src/lib.rs b/src/lib.rs index 29eb679..ddae9d8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -19,7 +19,7 @@ use pyo3::prelude::*; use pyo3::types::PyDict; #[cfg(not(test))] -type PygCompatible<'a> = (&'a PyDict, &'a PyDict, &'a PyDict); +type PygCompatible<'a> = (&'a PyDict, &'a PyDict, &'a PyDict, &'a PyDict); #[cfg(not(test))] create_exception!(phenolrs, PhenolError, PyException); @@ -31,18 +31,32 @@ create_exception!(phenolrs, PhenolError, PyException); #[cfg(not(test))] fn graph_to_pyg_format(py: Python, request: DataLoadRequest) -> PyResult { let graph = load::retrieve::get_arangodb_graph(request).map_err(PhenolError::new_err)?; + let col_to_features = construct::construct_col_to_features( convert_nested_features_map(graph.cols_to_features), py, )?; + let coo_by_from_edge_to = construct::construct_coo_by_from_edge_to( convert_coo_edge_map(graph.coo_by_from_edge_to), py, )?; + let cols_to_keys_to_inds = - construct::construct_cols_to_keys_to_inds(graph.cols_to_keys_to_inds, py)?; + construct::construct_cols_to_keys_to_inds(graph.cols_to_keys_to_inds.clone(), py)?; + + let cols_to_inds_to_keys = + construct::construct_cols_to_inds_to_keys(graph.cols_to_keys_to_inds, py)?; + println!("Finished retrieval!"); - let res = (col_to_features, coo_by_from_edge_to, cols_to_keys_to_inds); + + let res = ( + col_to_features, + coo_by_from_edge_to, + cols_to_keys_to_inds, + cols_to_inds_to_keys, + ); + Ok(res) } diff --git a/src/load/receive.rs b/src/load/receive.rs index e61a394..669eca7 100644 --- a/src/load/receive.rs +++ b/src/load/receive.rs @@ -189,10 +189,11 @@ pub fn receive_vertices( Ok(val) => val, }; let id = &v["_id"]; + let key = &v["_key"]; match id { Value::String(i) => { let mut buf = vec![]; - buf.extend_from_slice(i[..].as_bytes()); + buf.extend_from_slice(key.as_str().unwrap().as_bytes()); vertex_keys.push(buf); if current_vertex_col.is_none() { let pos = i.find('/').unwrap(); @@ -255,10 +256,11 @@ pub fn receive_vertices( }; for v in values.result.into_iter() { let id = &v["_id"]; + let key = &v["_key"]; match id { Value::String(i) => { let mut buf = vec![]; - buf.extend_from_slice(i[..].as_bytes()); + buf.extend_from_slice(key.as_str().unwrap().as_bytes()); vertex_keys.push(buf); if current_vertex_col.is_none() { let pos = i.find('/').unwrap(); diff --git a/src/output/construct.rs b/src/output/construct.rs index 83338a8..23a3bfb 100644 --- a/src/output/construct.rs +++ b/src/output/construct.rs @@ -43,3 +43,19 @@ pub fn construct_cols_to_keys_to_inds( .for_each(|item| dict.set_item(item.0, item.1).unwrap()); Ok(dict) } + +#[cfg(not(test))] +pub fn construct_cols_to_inds_to_keys( + input: HashMap>, + py: Python, +) -> PyResult<&PyDict> { + let dict = PyDict::new(py); + input.iter().for_each(|(col_name, inner_map)| { + let inner_dict = PyDict::new(py); + inner_map.iter().for_each(|(key, value)| { + inner_dict.set_item(value, key).unwrap(); + }); + dict.set_item(col_name, inner_dict).unwrap(); + }); + Ok(dict) +} From 39f4a804ca8278be7860a04703c77c3848c41f1c Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Wed, 15 May 2024 20:07:21 -0400 Subject: [PATCH 06/10] fix lint --- python/tests/test_all.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/python/tests/test_all.py b/python/tests/test_all.py index 276df74..5b7655f 100644 --- a/python/tests/test_all.py +++ b/python/tests/test_all.py @@ -29,7 +29,9 @@ def test_phenol_abide_hetero( == 871 ) - assert data[("Subjects", "medical_affinity_graph", "Subjects")]["edge_index"].shape == (2, 606770) + assert data[("Subjects", "medical_affinity_graph", "Subjects")][ + "edge_index" + ].shape == (2, 606770) # Metagraph variation result = PygLoader.load_into_pyg_heterodata( @@ -54,7 +56,9 @@ def test_phenol_abide_hetero( == 871 ) - assert data[("Subjects", "medical_affinity_graph", "Subjects")]["edge_index"].shape == (2, 606770) + assert data[("Subjects", "medical_affinity_graph", "Subjects")][ + "edge_index" + ].shape == (2, 606770) def test_phenol_abide_numpy( @@ -78,10 +82,9 @@ def test_phenol_abide_numpy( ) assert features_by_col["Subjects"]["brain_fmri_features"].shape == (871, 2000) - assert coo_map[("medical_affinity_graph", "Subjects", "Subjects")]["edge_index"].shape == ( - 2, - 606770, - ) + assert coo_map[("medical_affinity_graph", "Subjects", "Subjects")][ + "edge_index" + ].shape == (2, 606770) assert ( len(col_to_adb_key_to_ind["Subjects"]) == len(col_to_ind_to_adb_key["Subjects"]) From 28c857c84fc10733b5b43b29a9c60e9ef6333477 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Wed, 15 May 2024 20:08:28 -0400 Subject: [PATCH 07/10] fix: typo --- python/tests/test_all.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/tests/test_all.py b/python/tests/test_all.py index 5b7655f..771692f 100644 --- a/python/tests/test_all.py +++ b/python/tests/test_all.py @@ -82,9 +82,10 @@ def test_phenol_abide_numpy( ) assert features_by_col["Subjects"]["brain_fmri_features"].shape == (871, 2000) - assert coo_map[("medical_affinity_graph", "Subjects", "Subjects")][ - "edge_index" - ].shape == (2, 606770) + assert coo_map[("medical_affinity_graph", "Subjects", "Subjects")].shape == ( + 2, + 606770, + ) assert ( len(col_to_adb_key_to_ind["Subjects"]) == len(col_to_ind_to_adb_key["Subjects"]) From 25254e3bb7f26b4dcb62893973afcf2c4d2660fe Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Wed, 15 May 2024 20:12:51 -0400 Subject: [PATCH 08/10] remove temp file --- temp.py | 15 --------------- 1 file changed, 15 deletions(-) delete mode 100644 temp.py diff --git a/temp.py b/temp.py deleted file mode 100644 index 5a5604d..0000000 --- a/temp.py +++ /dev/null @@ -1,15 +0,0 @@ -from python.phenolrs.pyg_loader import PygLoader - -res = PygLoader.load_into_pyg_heterodata( - "abide", - { - "vertexCollections": {"Subjects": {"x": "brain_fmri_features"}}, - "edgeCollections": {"medical_affinity_graph": {'a': 'x'}}, - }, - ["http://localhost:8529"], - None, - "root", - "passwd", - batch_size=1000000, - parallelism=10, -) From 7333d9f3a966de0ffa2fd65116769aadd40e75f2 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Wed, 15 May 2024 20:14:25 -0400 Subject: [PATCH 09/10] fix: lint --- python/phenolrs/pyg_loader.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/phenolrs/pyg_loader.py b/python/phenolrs/pyg_loader.py index 7b5b861..977176d 100644 --- a/python/phenolrs/pyg_loader.py +++ b/python/phenolrs/pyg_loader.py @@ -20,7 +20,7 @@ def load_into_pyg_data( tls_cert: typing.Any | None = None, parallelism: int | None = None, batch_size: int | None = None, - ) -> tuple[Data, dict[str, dict[str, int]]]: + ) -> tuple[Data, dict[str, dict[str, int]], dict[str, dict[int, str]]]: if "vertexCollections" not in metagraph: raise PhenolError("vertexCollections not found in metagraph") if "edgeCollections" not in metagraph: @@ -46,7 +46,8 @@ def load_into_pyg_data( ( features_by_col, coo_map, - col_to_adb_id_to_ind, + col_to_adb_key_to_ind, + col_to_ind_to_adb_key, vertex_cols_source_to_output, ) = NumpyLoader.load_graph_to_numpy( database, @@ -85,7 +86,7 @@ def load_into_pyg_data( if result.numel() > 0: data["edge_index"] = result - return data, col_to_adb_id_to_ind + return data, col_to_adb_key_to_ind, col_to_ind_to_adb_key @staticmethod def load_into_pyg_heterodata( From 0e75d1322b085ac037b1d603e529088861e55c56 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Wed, 15 May 2024 20:31:18 -0400 Subject: [PATCH 10/10] fix: return `_key` if vertex --- src/arangodb/aql.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arangodb/aql.rs b/src/arangodb/aql.rs index c6310c4..b2cc99e 100644 --- a/src/arangodb/aql.rs +++ b/src/arangodb/aql.rs @@ -246,7 +246,7 @@ fn build_aql_query(collection_description: &CollectionDescription, is_edge: bool let identifiers = if is_edge { "_to: doc._to,\n_from: doc._from,\n" } else { - "" + "_key: doc._key,\n" }; let query = format!( "