From 6efb1c628973743c01945ffb724be2271eaede1e Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Tue, 27 Feb 2024 13:53:26 -0600 Subject: [PATCH 1/2] nx-cugraph: automatically generate trees in README.md (#4156) This updates how we create trees. Also, CI now tests that auto-generated files are up-to-date (not updating these has gotten me a couple of times). Authors: - Erik Welch (https://github.com/eriknw) Approvers: - Rick Ratzel (https://github.com/rlratzel) - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/cugraph/pull/4156 --- .gitignore | 2 + .pre-commit-config.yaml | 3 +- ci/test_python.sh | 7 + python/nx-cugraph/Makefile | 13 +- python/nx-cugraph/README.md | 270 +++++++++--------- python/nx-cugraph/lint.yaml | 12 +- .../algorithms/link_analysis/hits_alg.py | 6 +- .../algorithms/link_analysis/pagerank_alg.py | 6 +- python/nx-cugraph/nx_cugraph/classes/graph.py | 3 +- .../nx_cugraph/classes/multigraph.py | 4 +- python/nx-cugraph/nx_cugraph/interface.py | 6 +- .../nx_cugraph/scripts/print_table.py | 3 +- .../nx_cugraph/scripts/print_tree.py | 122 +++++--- python/nx-cugraph/pyproject.toml | 13 +- python/nx-cugraph/scripts/update_readme.py | 203 +++++++++++++ 15 files changed, 460 insertions(+), 213 deletions(-) create mode 100644 python/nx-cugraph/scripts/update_readme.py diff --git a/.gitignore b/.gitignore index 358650cfc5a..2fea1022910 100644 --- a/.gitignore +++ b/.gitignore @@ -78,6 +78,8 @@ datasets/* !datasets/karate-disjoint.csv !datasets/netscience.csv +# nx-cugraph side effects +python/nx-cugraph/objects.inv .pydevproject diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 188ea1a266a..6b7ff14417c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,8 +15,9 @@ repos: hooks: - id: black language_version: python3 - args: [--target-version=py38] + args: [--target-version=py39] files: ^(python/.*|benchmarks/.*)$ + exclude: ^python/nx-cugraph/ - repo: https://github.com/PyCQA/flake8 rev: 6.0.0 
hooks: diff --git a/ci/test_python.sh b/ci/test_python.sh index 8fa9a90ae69..9fa1de2e5e7 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -127,6 +127,13 @@ python -m nx_cugraph.scripts.print_tree --dispatch-name --plc --incomplete --dif python -m nx_cugraph.scripts.print_table popd +rapids-logger "ensure nx-cugraph autogenerated files are up to date" +pushd python/nx-cugraph +make || true +git diff --exit-code . +git checkout . +popd + rapids-logger "pytest cugraph-service (single GPU)" ./ci/run_cugraph_service_pytests.sh \ --verbose \ diff --git a/python/nx-cugraph/Makefile b/python/nx-cugraph/Makefile index 6e1b98ee6e9..6500d834ee7 100644 --- a/python/nx-cugraph/Makefile +++ b/python/nx-cugraph/Makefile @@ -1,12 +1,12 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. SHELL= /bin/bash .PHONY: all -all: plugin-info lint +all: plugin-info lint readme .PHONY: lint lint: - git ls-files | xargs pre-commit run --config lint.yaml --files + git ls-files | xargs pre-commit run --config lint.yaml --files || true .PHONY: lint-update lint-update: @@ -15,3 +15,10 @@ lint-update: .PHONY: plugin-info plugin-info: python _nx_cugraph/__init__.py + +objects.inv: + wget https://networkx.org/documentation/stable/objects.inv + +.PHONY: readme +readme: objects.inv + python scripts/update_readme.py README.md objects.inv diff --git a/python/nx-cugraph/README.md b/python/nx-cugraph/README.md index 5d0554734a8..8201dc34eb2 100644 --- a/python/nx-cugraph/README.md +++ b/python/nx-cugraph/README.md @@ -91,144 +91,144 @@ familiar and easy-to-use API. Below is the list of algorithms that are currently supported in nx-cugraph. 
-### Algorithms - -``` -bipartite - ├─ basic - │ └─ is_bipartite - └─ generators - └─ complete_bipartite_graph -centrality - ├─ betweenness - │ ├─ betweenness_centrality - │ └─ edge_betweenness_centrality - ├─ degree_alg - │ ├─ degree_centrality - │ ├─ in_degree_centrality - │ └─ out_degree_centrality - ├─ eigenvector - │ └─ eigenvector_centrality - └─ katz - └─ katz_centrality -cluster - ├─ average_clustering - ├─ clustering - ├─ transitivity - └─ triangles -community - └─ louvain - └─ louvain_communities -components - ├─ connected - │ ├─ connected_components - │ ├─ is_connected - │ ├─ node_connected_component - │ └─ number_connected_components - └─ weakly_connected - ├─ is_weakly_connected - ├─ number_weakly_connected_components - └─ weakly_connected_components -core - ├─ core_number - └─ k_truss -dag - ├─ ancestors - └─ descendants -isolate - ├─ is_isolate - ├─ isolates - └─ number_of_isolates -link_analysis - ├─ hits_alg - │ └─ hits - └─ pagerank_alg - └─ pagerank -operators - └─ unary - ├─ complement - └─ reverse -reciprocity - ├─ overall_reciprocity - └─ reciprocity -shortest_paths - └─ unweighted - ├─ single_source_shortest_path_length - └─ single_target_shortest_path_length -traversal - └─ breadth_first_search - ├─ bfs_edges - ├─ bfs_layers - ├─ bfs_predecessors - ├─ bfs_successors - ├─ bfs_tree - ├─ descendants_at_distance - └─ generic_bfs_edges -tree - └─ recognition - ├─ is_arborescence - ├─ is_branching - ├─ is_forest - └─ is_tree -``` - -### Generators - -``` -classic - ├─ barbell_graph - ├─ circular_ladder_graph - ├─ complete_graph - ├─ complete_multipartite_graph - ├─ cycle_graph - ├─ empty_graph - ├─ ladder_graph - ├─ lollipop_graph - ├─ null_graph - ├─ path_graph - ├─ star_graph - ├─ tadpole_graph - ├─ trivial_graph - ├─ turan_graph - └─ wheel_graph -community - └─ caveman_graph -small - ├─ bull_graph - ├─ chvatal_graph - ├─ cubical_graph - ├─ desargues_graph - ├─ diamond_graph - ├─ dodecahedral_graph - ├─ frucht_graph - ├─ heawood_graph - ├─ 
house_graph - ├─ house_x_graph - ├─ icosahedral_graph - ├─ krackhardt_kite_graph - ├─ moebius_kantor_graph - ├─ octahedral_graph - ├─ pappus_graph - ├─ petersen_graph - ├─ sedgewick_maze_graph - ├─ tetrahedral_graph - ├─ truncated_cube_graph - ├─ truncated_tetrahedron_graph - └─ tutte_graph -social - ├─ davis_southern_women_graph - ├─ florentine_families_graph - ├─ karate_club_graph - └─ les_miserables_graph -``` +### [Algorithms](https://networkx.org/documentation/latest/reference/algorithms/index.html) + +
+bipartite
+ ├─ basic
+ │   └─ is_bipartite
+ └─ generators
+     └─ complete_bipartite_graph
+centrality
+ ├─ betweenness
+ │   ├─ betweenness_centrality
+ │   └─ edge_betweenness_centrality
+ ├─ degree_alg
+ │   ├─ degree_centrality
+ │   ├─ in_degree_centrality
+ │   └─ out_degree_centrality
+ ├─ eigenvector
+ │   └─ eigenvector_centrality
+ └─ katz
+     └─ katz_centrality
+cluster
+ ├─ average_clustering
+ ├─ clustering
+ ├─ transitivity
+ └─ triangles
+community
+ └─ louvain
+     └─ louvain_communities
+components
+ ├─ connected
+ │   ├─ connected_components
+ │   ├─ is_connected
+ │   ├─ node_connected_component
+ │   └─ number_connected_components
+ └─ weakly_connected
+     ├─ is_weakly_connected
+     ├─ number_weakly_connected_components
+     └─ weakly_connected_components
+core
+ ├─ core_number
+ └─ k_truss
+dag
+ ├─ ancestors
+ └─ descendants
+isolate
+ ├─ is_isolate
+ ├─ isolates
+ └─ number_of_isolates
+link_analysis
+ ├─ hits_alg
+ │   └─ hits
+ └─ pagerank_alg
+     └─ pagerank
+operators
+ └─ unary
+     ├─ complement
+     └─ reverse
+reciprocity
+ ├─ overall_reciprocity
+ └─ reciprocity
+shortest_paths
+ └─ unweighted
+     ├─ single_source_shortest_path_length
+     └─ single_target_shortest_path_length
+traversal
+ └─ breadth_first_search
+     ├─ bfs_edges
+     ├─ bfs_layers
+     ├─ bfs_predecessors
+     ├─ bfs_successors
+     ├─ bfs_tree
+     ├─ descendants_at_distance
+     └─ generic_bfs_edges
+tree
+ └─ recognition
+     ├─ is_arborescence
+     ├─ is_branching
+     ├─ is_forest
+     └─ is_tree
+
+ +### [Generators](https://networkx.org/documentation/latest/reference/generators.html) + +
+classic
+ ├─ barbell_graph
+ ├─ circular_ladder_graph
+ ├─ complete_graph
+ ├─ complete_multipartite_graph
+ ├─ cycle_graph
+ ├─ empty_graph
+ ├─ ladder_graph
+ ├─ lollipop_graph
+ ├─ null_graph
+ ├─ path_graph
+ ├─ star_graph
+ ├─ tadpole_graph
+ ├─ trivial_graph
+ ├─ turan_graph
+ └─ wheel_graph
+community
+ └─ caveman_graph
+small
+ ├─ bull_graph
+ ├─ chvatal_graph
+ ├─ cubical_graph
+ ├─ desargues_graph
+ ├─ diamond_graph
+ ├─ dodecahedral_graph
+ ├─ frucht_graph
+ ├─ heawood_graph
+ ├─ house_graph
+ ├─ house_x_graph
+ ├─ icosahedral_graph
+ ├─ krackhardt_kite_graph
+ ├─ moebius_kantor_graph
+ ├─ octahedral_graph
+ ├─ pappus_graph
+ ├─ petersen_graph
+ ├─ sedgewick_maze_graph
+ ├─ tetrahedral_graph
+ ├─ truncated_cube_graph
+ ├─ truncated_tetrahedron_graph
+ └─ tutte_graph
+social
+ ├─ davis_southern_women_graph
+ ├─ florentine_families_graph
+ ├─ karate_club_graph
+ └─ les_miserables_graph
+
### Other -``` -convert_matrix - ├─ from_pandas_edgelist - └─ from_scipy_sparse_array -``` +
+convert_matrix
+ ├─ from_pandas_edgelist
+ └─ from_scipy_sparse_array
+
To request nx-cugraph backend support for a NetworkX API that is not listed above, visit the [cuGraph GitHub repo](https://github.com/rapidsai/cugraph). diff --git a/python/nx-cugraph/lint.yaml b/python/nx-cugraph/lint.yaml index 8e87fc23592..fdd24861da7 100644 --- a/python/nx-cugraph/lint.yaml +++ b/python/nx-cugraph/lint.yaml @@ -31,7 +31,7 @@ repos: - id: validate-pyproject name: Validate pyproject.toml - repo: https://github.com/PyCQA/autoflake - rev: v2.2.1 + rev: v2.3.0 hooks: - id: autoflake args: [--in-place] @@ -40,17 +40,17 @@ repos: hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v3.15.0 + rev: v3.15.1 hooks: - id: pyupgrade args: [--py39-plus] - repo: https://github.com/psf/black - rev: 23.12.1 + rev: 24.2.0 hooks: - id: black # - id: black-jupyter - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.14 + rev: v0.2.2 hooks: - id: ruff args: [--fix-only, --show-fixes] # --unsafe-fixes] @@ -62,7 +62,7 @@ repos: additional_dependencies: &flake8_dependencies # These versions need updated manually - flake8==7.0.0 - - flake8-bugbear==24.1.17 + - flake8-bugbear==24.2.6 - flake8-simplify==0.21.0 - repo: https://github.com/asottile/yesqa rev: v1.5.0 @@ -77,7 +77,7 @@ repos: additional_dependencies: [tomli] files: ^(nx_cugraph|docs)/ - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.14 + rev: v0.2.2 hooks: - id: ruff - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py index 9e723624a3b..e61a931c069 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py @@ -66,9 +66,9 @@ def hits( resource_handle=plc.ResourceHandle(), graph=G._get_plc_graph(weight, 1, dtype, store_transposed=True), tol=tol, - initial_hubs_guess_vertices=None - if nstart is None - else cp.arange(N, 
dtype=index_dtype), + initial_hubs_guess_vertices=( + None if nstart is None else cp.arange(N, dtype=index_dtype) + ), initial_hubs_guess_values=nstart, max_iter=max_iter, normalized=normalized, diff --git a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py index 55fcc3e520a..40224e91d57 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py @@ -78,9 +78,9 @@ def pagerank( "graph": G._get_plc_graph(weight, 1, dtype, store_transposed=True), "precomputed_vertex_out_weight_vertices": None, "precomputed_vertex_out_weight_sums": None, - "initial_guess_vertices": None - if nstart is None - else cp.arange(N, dtype=index_dtype), + "initial_guess_vertices": ( + None if nstart is None else cp.arange(N, dtype=index_dtype) + ), "initial_guess_values": nstart, "alpha": alpha, "epsilon": N * tol, diff --git a/python/nx-cugraph/nx_cugraph/classes/graph.py b/python/nx-cugraph/nx_cugraph/classes/graph.py index 0951ee6b135..5132e6a547b 100644 --- a/python/nx-cugraph/nx_cugraph/classes/graph.py +++ b/python/nx-cugraph/nx_cugraph/classes/graph.py @@ -634,8 +634,7 @@ def _get_plc_graph( "pylibcugraph only supports float16 and float32 dtypes." 
) elif ( - edge_array.dtype == np.uint64 - and edge_array.max().tolist() > 2**53 + edge_array.dtype == np.uint64 and edge_array.max().tolist() > 2**53 ): raise ValueError( f"Integer value of value is too large (> 2**53): {val}; " diff --git a/python/nx-cugraph/nx_cugraph/classes/multigraph.py b/python/nx-cugraph/nx_cugraph/classes/multigraph.py index fb787369e58..de58474de70 100644 --- a/python/nx-cugraph/nx_cugraph/classes/multigraph.py +++ b/python/nx-cugraph/nx_cugraph/classes/multigraph.py @@ -360,9 +360,7 @@ def get_edge_data( if k not in self.edge_masks or self.edge_masks[k][index] } return { - edge_keys[index] - if edge_keys is not None - else index: { + edge_keys[index] if edge_keys is not None else index: { k: v[index].tolist() for k, v in self.edge_values.items() if k not in self.edge_masks or self.edge_masks[k][index] diff --git a/python/nx-cugraph/nx_cugraph/interface.py b/python/nx-cugraph/nx_cugraph/interface.py index 3c62fc3628e..d044ba6960d 100644 --- a/python/nx-cugraph/nx_cugraph/interface.py +++ b/python/nx-cugraph/nx_cugraph/interface.py @@ -220,9 +220,9 @@ def key(testpath): ) if sys.version_info[:2] == (3, 9): # This test is sensitive to RNG, which depends on Python version - xfail[ - key("test_louvain.py:test_threshold") - ] = "Louvain does not support seed parameter" + xfail[key("test_louvain.py:test_threshold")] = ( + "Louvain does not support seed parameter" + ) if nxver.major == 3 and nxver.minor >= 2: xfail.update( { diff --git a/python/nx-cugraph/nx_cugraph/scripts/print_table.py b/python/nx-cugraph/nx_cugraph/scripts/print_table.py index 117a1444f48..7c90281247c 100755 --- a/python/nx-cugraph/nx_cugraph/scripts/print_table.py +++ b/python/nx-cugraph/nx_cugraph/scripts/print_table.py @@ -61,7 +61,8 @@ def main(path_to_info=None, *, file=sys.stdout): lines = ["networkx_path,dispatch_name,version_added,plc,is_incomplete,is_different"] lines.extend(",".join(map(str, info)) for info in path_to_info.values()) text = "\n".join(lines) - 
print(text, file=file) + if file is not None: + print(text, file=file) return text diff --git a/python/nx-cugraph/nx_cugraph/scripts/print_tree.py b/python/nx-cugraph/nx_cugraph/scripts/print_tree.py index 485873a447d..fbb1c3dd0c5 100755 --- a/python/nx-cugraph/nx_cugraph/scripts/print_tree.py +++ b/python/nx-cugraph/nx_cugraph/scripts/print_tree.py @@ -12,29 +12,58 @@ # See the License for the specific language governing permissions and # limitations under the License. import argparse -import re import sys -import networkx as nx +from nx_cugraph.scripts.print_table import Info, get_path_to_info -from nx_cugraph.scripts.print_table import get_path_to_info +def assoc_in(d, keys, value): + """Like Clojure's assoc-in, but modifies d in-place.""" + inner = d + keys = iter(keys) + key = next(keys) + for next_key in keys: + if key not in inner: + inner[key] = {} + inner = inner[key] + key = next_key + inner[key] = value + return d -def add_branch(G, path, extra="", *, skip=0): - branch = path.split(".") - prev = ".".join(branch[: skip + 1]) - for i in range(skip + 2, len(branch)): - cur = ".".join(branch[:i]) - G.add_edge(prev, cur) - prev = cur - if extra: - if not isinstance(extra, str): - extra = ", ".join(extra) - path += f" ({extra})" - G.add_edge(prev, path) + +def default_get_payload_internal(keys): + return keys[-1] + + +def tree_lines( + tree, + parents=(), + are_levels_closing=(), + get_payload_internal=default_get_payload_internal, +): + pre = "".join( + " " if is_level_closing else " │ " + for is_level_closing in are_levels_closing + ) + c = "├" + are_levels_closing += (False,) + for i, (key, val) in enumerate(tree.items(), 1): + if i == len(tree): # Last item + c = "└" + are_levels_closing = are_levels_closing[:-1] + (True,) + if isinstance(val, str): + yield pre + f" {c}─ " + val + else: + yield pre + f" {c}─ " + get_payload_internal((*parents, key)) + yield from tree_lines( + val, + (*parents, key), + are_levels_closing, + 
get_payload_internal=get_payload_internal, + ) -def get_extra( +def get_payload( info, *, networkx_path=False, @@ -64,7 +93,10 @@ def get_extra( extra.append("is-incomplete") if different and info.is_different: extra.append("is-different") - return extra + extra = ", ".join(extra) + if extra: + extra = f" ({extra})" + return info.networkx_path.rsplit(".", 1)[-1] + extra def create_tree( @@ -80,12 +112,20 @@ def create_tree( incomplete=False, different=False, prefix="", + strip_networkx=True, + get_payload=get_payload, ): if path_to_info is None: path_to_info = get_path_to_info() + if strip_networkx: + path_to_info = { + key: Info(info.networkx_path.replace("networkx.", "", 1), *info[1:]) + for key, info in path_to_info.items() + } if isinstance(by, str): by = [by] - G = nx.DiGraph() + # We rely on the fact that dicts maintain order + tree = {} for info in sorted( path_to_info.values(), key=lambda x: (*(getattr(x, b) for b in by), x.networkx_path), @@ -93,7 +133,7 @@ def create_tree( if not all(getattr(info, b) for b in by): continue path = prefix + ".".join(getattr(info, b) for b in by) - extra = get_extra( + payload = get_payload( info, networkx_path=networkx_path, dispatch_name=dispatch_name, @@ -103,8 +143,8 @@ def create_tree( incomplete=incomplete, different=different, ) - add_branch(G, path, extra=extra, skip=skip) - return G + assoc_in(tree, path.split("."), payload) + return tree def main( @@ -132,45 +172,33 @@ def main( "different": different, } if by == "networkx_path": - G = create_tree(path_to_info, by="networkx_path", **kwargs) - text = re.sub( - r" [A-Za-z_\./]+\.", " ", ("\n".join(nx.generate_network_text(G))) - ) + tree = create_tree(path_to_info, by="networkx_path", **kwargs) + text = "\n".join(tree_lines(tree)) elif by == "plc": - G = create_tree( - path_to_info, by=["plc", "networkx_path"], prefix="plc-", **kwargs - ) - text = re.sub( - "plc-", - "plc.", - re.sub( - r" plc-[A-Za-z_\./]*\.", - " ", - "\n".join(nx.generate_network_text(G)), - ), + 
tree = create_tree( + path_to_info, + by=["plc", "networkx_path"], + prefix="plc-", + **kwargs, ) + text = "\n".join(tree_lines(tree)).replace("plc-", "plc.") elif by == "version_added": - G = create_tree( + tree = create_tree( path_to_info, by=["version_added", "networkx_path"], prefix="version_added-", **kwargs, ) - text = re.sub( - "version_added-", - "version: ", - re.sub( - r" version_added-[-0-9A-Za-z_\./]*\.", - " ", - "\n".join(nx.generate_network_text(G)), - ), - ).replace("-", ".") + text = "\n".join(tree_lines(tree)).replace("version_added-", "version: ") + for digit in "0123456789": + text = text.replace(f"2{digit}-", f"2{digit}.") else: raise ValueError( "`by` argument should be one of {'networkx_path', 'plc', 'version_added' " f"got: {by}" ) - print(text, file=file) + if file is not None: + print(text, file=file) return text diff --git a/python/nx-cugraph/pyproject.toml b/python/nx-cugraph/pyproject.toml index a7525530ac8..60a4b5589d4 100644 --- a/python/nx-cugraph/pyproject.toml +++ b/python/nx-cugraph/pyproject.toml @@ -149,8 +149,10 @@ exclude_lines = [ # https://github.com/charliermarsh/ruff/ line-length = 88 target-version = "py39" +[tool.ruff.lint] unfixable = [ "F841", # unused-variable (Note: can leave useless expression) + "B905", # zip-without-explicit-strict (Note: prefer `zip(x, y, strict=True)`) ] select = [ "ALL", @@ -172,7 +174,6 @@ ignore = [ # "SIM401", # Use dict.get ... instead of if-else-block (Note: if-else better for coverage and sometimes clearer) # "TRY004", # Prefer `TypeError` exception for invalid type (Note: good advice, but not worth the nuisance) "B904", # Bare `raise` inside exception clause (like TRY200; sometimes okay) - "TRY200", # Use `raise from` to specify exception cause (Note: sometimes okay to raise original exception) # Intentionally ignored "A003", # Class attribute ... 
is shadowing a python builtin @@ -224,22 +225,22 @@ ignore = [ "PD", # pandas-vet (Intended for scripts that use pandas, not libraries) ] -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] "__init__.py" = ["F401"] # Allow unused imports (w/o defining `__all__`) # Allow assert, print, RNG, and no docstring "nx_cugraph/**/tests/*py" = ["S101", "S311", "T201", "D103", "D100"] "_nx_cugraph/__init__.py" = ["E501"] "nx_cugraph/algorithms/**/*py" = ["D205", "D401"] # Allow flexible docstrings for algorithms -[tool.ruff.flake8-annotations] +[tool.ruff.lint.flake8-annotations] mypy-init-return = true -[tool.ruff.flake8-builtins] +[tool.ruff.lint.flake8-builtins] builtins-ignorelist = ["copyright"] -[tool.ruff.flake8-pytest-style] +[tool.ruff.lint.flake8-pytest-style] fixture-parentheses = false mark-parentheses = false -[tool.ruff.pydocstyle] +[tool.ruff.lint.pydocstyle] convention = "numpy" diff --git a/python/nx-cugraph/scripts/update_readme.py b/python/nx-cugraph/scripts/update_readme.py new file mode 100644 index 00000000000..1ab5a76c4c0 --- /dev/null +++ b/python/nx-cugraph/scripts/update_readme.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse +import re +import zlib +from collections import namedtuple +from pathlib import Path +from warnings import warn + +from nx_cugraph.scripts.print_tree import create_tree, tree_lines + +# See: https://sphobjinv.readthedocs.io/en/stable/syntax.html +DocObject = namedtuple( + "DocObject", + "name, domain, role, priority, uri, displayname", +) + + +def parse_docobject(line): + left, right = line.split(":") + name, domain = left.rsplit(" ", 1) + role, priority, uri, displayname = right.split(" ", 3) + if displayname == "-": + displayname = name + if uri.endswith("$"): + uri = uri[:-1] + name + return DocObject(name, domain, role, priority, uri, displayname) + + +def replace_body(text, match, new_body): + start, stop = match.span("body") + return text[:start] + new_body + text[stop:] + + +# NetworkX isn't perfectly intersphinx-compatible, so manually specify some urls. +# See: https://github.com/networkx/networkx/issues/7278 +MANUAL_OBJECT_URLS = { + "networkx.algorithms.centrality.betweenness": ( + "https://networkx.org/documentation/stable/reference/" + "algorithms/centrality.html#shortest-path-betweenness" + ), + "networkx.algorithms.centrality.degree_alg": ( + "https://networkx.org/documentation/stable/reference/" + "algorithms/centrality.html#degree" + ), + "networkx.algorithms.centrality.eigenvector": ( + "https://networkx.org/documentation/stable/reference/" + "algorithms/centrality.html#eigenvector" + ), + "networkx.algorithms.centrality.katz": ( + "https://networkx.org/documentation/stable/reference/" + "algorithms/centrality.html#eigenvector" + ), + "networkx.algorithms.components.connected": ( + "https://networkx.org/documentation/stable/reference/" + "algorithms/component.html#connectivity" + ), + "networkx.algorithms.components.weakly_connected": ( + "https://networkx.org/documentation/stable/reference/" + "algorithms/component.html#weak-connectivity" + ), +} + + +def main(readme_file, objects_filename): + """``readme_file`` must be readable 
and writable, so use mode ``"a+"``""" + # Use the `objects.inv` file to determine URLs. For details about this file, see: + # https://sphobjinv.readthedocs.io/en/stable/syntax.html + # We might be better off using a library like that, but roll our own for now. + with Path(objects_filename).open("rb") as objects_file: + line = objects_file.readline() + if line != b"# Sphinx inventory version 2\n": + raise RuntimeError(f"Bad line in objects.inv:\n\n{line}") + line = objects_file.readline() + if line != b"# Project: NetworkX\n": + raise RuntimeError(f"Bad line in objects.inv:\n\n{line}") + line = objects_file.readline() + if not line.startswith(b"# Version: "): + raise RuntimeError(f"Bad line in objects.inv:\n\n{line}") + line = objects_file.readline() + if line != b"# The remainder of this file is compressed using zlib.\n": + raise RuntimeError(f"Bad line in objects.inv:\n\n{line}") + zlib_data = objects_file.read() + objects_text = zlib.decompress(zlib_data).decode().strip() + objects_list = [parse_docobject(line) for line in objects_text.split("\n")] + doc_urls = { + obj.name: "https://networkx.org/documentation/stable/" + obj.uri + for obj in objects_list + } + if len(objects_list) != len(doc_urls): + raise RuntimeError("Oops; duplicate names found in objects.inv") + + def get_payload(info, **kwargs): + path = "networkx." + info.networkx_path + subpath, name = path.rsplit(".", 1) + # Many objects are referred to in modules above where they are defined. + while subpath: + path = f"{subpath}.{name}" + if path in doc_urls: + return f'{name}' + subpath = subpath.rsplit(".", 1)[0] + warn(f"Unable to find URL for {name!r}: {path}", stacklevel=0) + return name + + def get_payload_internal(keys): + path = "networkx." 
+ ".".join(keys) + name = keys[-1] + if path in doc_urls: + return f'{name}' + path2 = "reference/" + "/".join(keys) + if path2 in doc_urls: + return f'{name}' + if path in MANUAL_OBJECT_URLS: + return f'{name}' + warn(f"Unable to find URL for {name!r}: {path}", stacklevel=0) + return name + + readme_file.seek(0) + text = readme_file.read() + tree = create_tree(get_payload=get_payload) + # Algorithms + match = re.search( + r"### .Algorithms(?P.*?)
\n(?P.*?)\n
", + text, + re.DOTALL, + ) + if not match: + raise RuntimeError("Algorithms section not found!") + lines = [] + for key, val in tree["algorithms"].items(): + lines.append(get_payload_internal(("algorithms", key))) + lines.extend( + tree_lines( + val, + parents=("algorithms", key), + get_payload_internal=get_payload_internal, + ) + ) + text = replace_body(text, match, "\n".join(lines)) + # Generators + match = re.search( + r"### .Generators(?P.*?)
\n(?P.*?)\n
", + text, + re.DOTALL, + ) + if not match: + raise RuntimeError("Generators section not found!") + lines = [] + for key, val in tree["generators"].items(): + lines.append(get_payload_internal(("generators", key))) + lines.extend( + tree_lines( + val, + parents=("generators", key), + get_payload_internal=get_payload_internal, + ) + ) + text = replace_body(text, match, "\n".join(lines)) + # Other + match = re.search( + r"### Other\n(?P.*?)
\n(?P.*?)\n
", + text, + re.DOTALL, + ) + if not match: + raise RuntimeError("Other section not found!") + lines = [] + for key, val in tree.items(): + if key in {"algorithms", "generators"}: + continue + lines.append(get_payload_internal((key,))) + lines.extend( + tree_lines(val, parents=(key,), get_payload_internal=get_payload_internal) + ) + text = replace_body(text, match, "\n".join(lines)) + # Now overwrite README.md + readme_file.truncate(0) + readme_file.write(text) + return text + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + "Update README.md to show NetworkX functions implemented by nx-cugraph" + ) + parser.add_argument("readme_filename", help="Path to the README.md file") + parser.add_argument( + "networkx_objects", help="Path to the objects.inv file from networkx docs" + ) + args = parser.parse_args() + with Path(args.readme_filename).open("a+") as readme_file: + main(readme_file, args.networkx_objects) From ac65b17ee3e9b85368f266da1a6a3b8e5717e292 Mon Sep 17 00:00:00 2001 From: Rick Ratzel <3039903+rlratzel@users.noreply.github.com> Date: Wed, 28 Feb 2024 12:28:38 -0600 Subject: [PATCH 2/2] Adds additional detail to Jaccard notebook (#4189) Followup to PR #4176 to add additional detail to the Jaccard notebook. 
* Adds revision history to bottom * Adds more detail to Jaccard description * Adds cell output * Adds example of using the `vertex_pair` arg Authors: - Rick Ratzel (https://github.com/rlratzel) Approvers: - Don Acosta (https://github.com/acostadon) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/4189 --- .../link_prediction/Jaccard-Similarity.ipynb | 380 +++++++++++++++++- 1 file changed, 364 insertions(+), 16 deletions(-) diff --git a/notebooks/algorithms/link_prediction/Jaccard-Similarity.ipynb b/notebooks/algorithms/link_prediction/Jaccard-Similarity.ipynb index b5f09c0c145..86bb4d17c22 100755 --- a/notebooks/algorithms/link_prediction/Jaccard-Similarity.ipynb +++ b/notebooks/algorithms/link_prediction/Jaccard-Similarity.ipynb @@ -8,7 +8,12 @@ "# Jaccard Similarity\n", "----\n", "\n", - "In this notebook we will explore the Jaccard vertex similarity metrics available in cuGraph." + "In this notebook we will explore the Jaccard vertex similarity metrics available in cuGraph.\n", + "\n", + "cuGraph supports Jaccard similarity for both unweighted and weighted graphs, but this notebook \n", + "will demonstrate Jaccard similarity only on unweighted graphs. A future update will include an \n", + "example using a graph with edge weights, where the weights are used to influence the Jaccard \n", + "similarity coefficients." ] }, { @@ -18,30 +23,48 @@ "source": [ "## Introduction\n", "\n", - "The Jaccard similarity between two sets is defined as the ratio of the volume of their intersection divided by the volume of their union. 
\n", + "The Jaccard similarity between two sets is defined as the ratio of the volume of their intersection \n", + "divided by the volume of their union, where the sets used are the sets of neighboring vertices for each \n", + "vertex.\n", + "\n", + "The neighbors of a vertex, _v_, are defined as the set, _U_, of vertices connected by way of an edge to vertex _v_, or _N(v) = U = {u ∈ V : edge(v,u) ∈ E}_.\n", + "\n", + "If we then let set __A__ be the set of neighbors for vertex _a_, and set __B__ be the set of neighbors for vertex _b_, then the Jaccard Similarity for the vertex pair _(a, b)_ can be expressed as\n", + "\n", "$\\text{Jaccard similarity} = \\frac{|A \\cap B|}{|A \\cup B|}$\n", "\n", "\n", - "To compute the Jaccard similarity between all pairs of vertices connected by an edge in cuGraph use:
\n", - "__df = cugraph.jaccard(G)__\n", + "cuGraph's Jaccard function will, by default, compute the Jaccard similarity coefficient for every pair of \n", + "vertices in the two-hop neighborhood for every vertex.\n", + "\n", + "```df = cugraph.jaccard(G, vertex_pair=None)```\n", + "\n", + "Parameters:\n", "\n", " G: A cugraph.Graph object\n", "\n", + " vertex_pair: cudf.DataFrame, optional (default=None)\n", + " A GPU dataframe consisting of two columns representing pairs of\n", + " vertices. If provided, the jaccard coefficient is computed for the\n", + " given vertex pairs. If the vertex_pair is not provided then the\n", + " current implementation computes the jaccard coefficient for all\n", + " vertex pairs in the two-hop neighborhood of every vertex.\n", + "\n", "Returns:\n", "\n", " df: cudf.DataFrame with three columns:\n", " df[\"first\"]: The first vertex id of each pair.\n", " df[\"second\"]: The second vertex id of each pair.\n", " df[\"jaccard_coeff\"]: The jaccard coefficient computed between the vertex pairs.\n", - "
\n", + "\n", + "To limit the computation to specific vertex pairs, including those not in the same two-hop \n", + "neighborhood, pass a `vertex_pair` value (see example below).\n", "\n", "__References__ \n", "- https://research.nvidia.com/publication/2017-11_Parallel-Jaccard-and \n", "\n", "__Additional Reading__ \n", + "- [Intro to Graph Analysis using cuGraph: Similarity Algorithms](https://medium.com/rapids-ai/intro-to-graph-analysis-using-cugraph-similarity-algorithms-64fa923791ac)\n", "- [Wikipedia: Jaccard](https://en.wikipedia.org/wiki/Jaccard_index)\n" ] }, @@ -71,7 +94,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "scrolled": true }, @@ -96,7 +119,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -115,7 +138,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -134,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -147,9 +170,189 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
firstsecondjaccard_coeff
54114151.000000
54214181.000000
54314201.000000
54414221.000000
56115181.000000
56215201.000000
56315221.000000
58717211.000000
60518201.000000
60618221.000000
62520221.000000
2997130.800000
2856100.750000
388450.750000
44319210.666667
5029280.666667
58417190.666667
22313190.600000
4532330.526316
3107120.500000
\n", + "
" + ], + "text/plain": [ + " first second jaccard_coeff\n", + "541 14 15 1.000000\n", + "542 14 18 1.000000\n", + "543 14 20 1.000000\n", + "544 14 22 1.000000\n", + "561 15 18 1.000000\n", + "562 15 20 1.000000\n", + "563 15 22 1.000000\n", + "587 17 21 1.000000\n", + "605 18 20 1.000000\n", + "606 18 22 1.000000\n", + "625 20 22 1.000000\n", + "299 7 13 0.800000\n", + "285 6 10 0.750000\n", + "388 4 5 0.750000\n", + "443 19 21 0.666667\n", + "502 9 28 0.666667\n", + "584 17 19 0.666667\n", + "223 13 19 0.600000\n", + "45 32 33 0.526316\n", + "310 7 12 0.500000" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Show the top-20 most similar vertices.\n", "jaccard_coeffs.head(20)" @@ -169,15 +372,63 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We have to specify vertices in a DataFrame to see their similarity if they\n", - "are not part of the same two-hop neighborhood." + "If we want to see the similarity of a pair of vertices that are not part of \n", + "the same two-hop neighborhood, we have to specify them in a `cudf.DataFrame` \n", + "to pass to the `jaccard` call." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
firstsecondjaccard_coeff
016330.0
\n", + "
" + ], + "text/plain": [ + " first second jaccard_coeff\n", + "0 16 33 0.0" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "cugraph.jaccard(G, cudf.DataFrame([(16, 33)]))" ] @@ -191,6 +442,88 @@ "neighbors." ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can use the `cudf.DataFrame` argument to pass in any number of specific vertex pairs \n", + "to compute the similarity for, regardless of whether or not they're included by default. \n", + "This is useful to limit the computation and result size when only specific vertex \n", + "similarities are needed." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
firstsecondjaccard_coeff
016330.000000
132330.526316
20230.000000
\n", + "
" + ], + "text/plain": [ + " first second jaccard_coeff\n", + "0 16 33 0.000000\n", + "1 32 33 0.526316\n", + "2 0 23 0.000000" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pairs = cudf.DataFrame([(16, 33), (32, 33), (0, 23)])\n", + "cugraph.jaccard(G, pairs)" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -206,6 +539,21 @@ "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.\n", "___" ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Revision History\n", + "\n", + "| Author | Date | Update | cuGraph Version | Test Hardware |\n", + "| --------------|------------|------------------|-----------------|---------------------------|\n", + "| Brad Rees | 10/14/2019 | created | 0.14 | GV100 32 GB, CUDA 10.2 |\n", + "| Don Acosta | 07/20/2022 | tested/updated | 22.08 nightly | DGX Tesla V100, CUDA 11.5 |\n", + "| Ralph Liu | 06/29/2023 | updated | 23.08 nightly | DGX Tesla V100, CUDA 12.0 |\n", + "| Rick Ratzel | 02/23/2024 | tested/updated | 24.04 nightly | DGX Tesla V100, CUDA 12.0 |" + ] } ], "metadata": {