diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 494565c95d..011659e0a6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -54,6 +54,8 @@ jobs: ./bin/lint.sh - name: Type check + env: + PYTHON_VERSION: ${{ matrix.python-version }} run: | source pygraphistry/bin/activate ./bin/typecheck.sh @@ -101,6 +103,8 @@ jobs: ./bin/lint.sh - name: Type check + env: + PYTHON_VERSION: ${{ matrix.python-version }} run: | source pygraphistry/bin/activate ./bin/typecheck.sh @@ -143,6 +147,8 @@ jobs: python -m pip install -e .[test,pygraphviz] - name: Type check + env: + PYTHON_VERSION: ${{ matrix.python-version }} run: | source pygraphistry/bin/activate ./bin/typecheck.sh @@ -159,8 +165,7 @@ jobs: strategy: matrix: - #python-version: [3.8, 3.9, '3.10', 3.11, 3.12] - python-version: [3.8, 3.9] + python-version: [3.9, '3.10', 3.11, 3.12] steps: @@ -185,6 +190,8 @@ jobs: python -m pip install -e .[test,testai,umap-learn] - name: Type check + env: + PYTHON_VERSION: ${{ matrix.python-version }} run: | source pygraphistry/bin/activate ./bin/typecheck.sh @@ -206,8 +213,7 @@ jobs: strategy: matrix: - python-version: [3.8, 3.9] - #python-version: [3.8, 3.9, '3.10', 3.11, 3.12] + python-version: [3.9, '3.10', 3.11, 3.12] #include: # - python-version: 3.12 # continue-on-error: true @@ -233,7 +239,7 @@ jobs: source pygraphistry/bin/activate python -m pip install --upgrade pip python -m pip install -e .[test,testai,ai] - echo "dirty-cat: `pip show dirty-cat | grep Version`" + echo "skrub: `pip show skrub | grep Version`" echo "pandas: `pip show pandas | grep Version`" echo "numpy: `pip show numpy | grep Version`" echo "scikit-learn: `pip show scikit-learn | grep Version`" @@ -241,6 +247,8 @@ jobs: echo "umap-learn: `pip show umap-learn | grep Version`" - name: Type check + env: + PYTHON_VERSION: ${{ matrix.python-version }} run: | source pygraphistry/bin/activate ./bin/typecheck.sh @@ -270,6 +278,11 @@ jobs: source pygraphistry/bin/activate ./bin/test-embed.sh + - name: Full DGL tests (rich featurize) + run: | + source pygraphistry/bin/activate + ./bin/test-dgl.sh + test-neo4j: diff --git a/CHANGELOG.md b/CHANGELOG.md index e24dc4d93c..34582bd845 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,43 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ## [Development] +## [0.36.0 - 2025-02-05] + +### Breaking + +* `from_cugraph` returns using the src/dst bindings of `cugraph.Graph` object instead of base `Plottable` +* `pip install graphistry[umap-learn]` and `pip install graphistry[ai]` are now Python 3.9+ (was 3.8+) +* `Plottable`'s fields `_node_dbscan` / `_edge_dbscan` are now `_dbscan_nodes` / `_dbscan_edges` + +### Feat + +* Switch to `skrub` for feature engineering +* More AI methods support GPU path +* Support cugraph 26.10+, numpy 2.0+ +* Add more umap, dbscan fields to `Plottable` + +### Infra + +* `[umap-learn]` + `[ai]` unpin deps - scikit, scipy, torch (now 2), etc + +### Refactor + +* Move more type models to models/compute/{feature,umap,cluster} +* Turn more print => logger + +### Fixes + +* Remove lint/type ignores and fix root causes + +### Tests + +* Stop ignoring warnings in featurize and umap +* python version tests use corresponding python version for mypy +* ci umap tests: py 3.8, 3.9 => 3.9..3.12 +* ci ai tests: py 3.8, 3.9 => 3.9..3.12 +* ci tests dgl +* plugin tests check for module imports + ## [0.35.10 - 2025-01-24] ### Fixes: diff --git a/bin/test-dgl.sh b/bin/test-dgl.sh old mode 100644 new mode 100755 index 
e69de29bb2..41d2fb0be7 --- a/bin/test-dgl.sh +++ b/bin/test-dgl.sh @@ -0,0 +1,13 @@ +#!/bin/bash +set -ex + +# Run from project root +# - Args get passed to pytest phase +# Non-zero exit code on fail + +# Assume [umap-learn,test] + +python -m pytest --version + +python -B -m pytest -vv \ + graphistry/tests/test_dgl_utils.py diff --git a/bin/typecheck.sh b/bin/typecheck.sh index ee7f148750..b3cfdc13e7 100755 --- a/bin/typecheck.sh +++ b/bin/typecheck.sh @@ -6,5 +6,9 @@ set -ex mypy --version -# Check core -mypy --config-file mypy.ini graphistry +if [ -n "$PYTHON_VERSION" ]; then + SHORT_VERSION=$(echo "$PYTHON_VERSION" | cut -d. -f1,2) + mypy --python-version "$SHORT_VERSION" --config-file mypy.ini graphistry +else + mypy --config-file mypy.ini graphistry +fi diff --git a/docker/test-cpu-umap-ai.sh b/docker/test-cpu-umap-ai.sh index 9a95ed3f6b..8e0adbcd81 100755 --- a/docker/test-cpu-umap-ai.sh +++ b/docker/test-cpu-umap-ai.sh @@ -2,7 +2,9 @@ set -ex -PYTHON_VERSION=${PYTHON_VERSION:-3.8} \ +TEST_FILES=${@:-"graphistry/tests/test_feature_utils.py graphistry/tests/test_umap_utils.py"} + +PYTHON_VERSION=${PYTHON_VERSION:-3.10} \ PIP_DEPS=${PIP_DEPS:--e .[ai,test,testai]} \ WITH_LINT=${WITH_LINT:-1} \ WITH_TYPECHECK=${WITH_TYPECHECK:-1} \ @@ -11,6 +13,4 @@ WITH_TEST=${WITH_TEST:-1} \ SENTENCE_TRANSFORMER=${SENTENCE_TRANSFORMER-average_word_embeddings_komninos} \ SENTENCE_TRANSFORMER=${SENTENCE_TRANSFORMER} \ ./test-cpu-local.sh \ - graphistry/tests/test_feature_utils.py \ - graphistry/tests/test_umap_utils.py \ - $@ + $TEST_FILES \ No newline at end of file diff --git a/docker/test-cpu-umap.sh b/docker/test-cpu-umap.sh index a48f67f56f..e5b1d1e217 100755 --- a/docker/test-cpu-umap.sh +++ b/docker/test-cpu-umap.sh @@ -2,7 +2,9 @@ set -ex -PYTHON_VERSION=${PYTHON_VERSION:-3.8} \ +TEST_FILES=${@:-"graphistry/tests/test_feature_utils.py graphistry/tests/test_umap_utils.py"} + +PYTHON_VERSION=${PYTHON_VERSION:-3.9} \ PIP_DEPS=${PIP_DEPS:--e .[umap-learn,test,testai]} \ WITH_LINT=${WITH_LINT:-1} \ WITH_TYPECHECK=${WITH_TYPECHECK:-1} \ @@ -11,6 +13,4 @@ WITH_TEST=${WITH_TEST:-1} \ SENTENCE_TRANSFORMER=${SENTENCE_TRANSFORMER-average_word_embeddings_komninos} \ SENTENCE_TRANSFORMER=${SENTENCE_TRANSFORMER} \ ./test-cpu-local.sh \ - graphistry/tests/test_feature_utils.py \ - graphistry/tests/test_umap_utils.py \ - $@ + $TEST_FILES \ No newline at end of file diff --git a/docker/test-gpu-local.sh b/docker/test-gpu-local.sh index e99a75a7f6..b6df9b0a5d 100755 --- a/docker/test-gpu-local.sh +++ b/docker/test-gpu-local.sh @@ -47,5 +47,4 @@ docker run \ ${NETWORK} \ graphistry/test-gpu:${TEST_CPU_VERSION} \ --maxfail=1 \ - --ignore=graphistry/tests/test_feature_utils.py \ $@ diff --git a/docs/source/conf.py b/docs/source/conf.py index 83684afd21..12a0f4a7e0 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -239,10 +239,10 @@ ('py:class', 'torch'), ('py:class', 'umap'), ('py:class', 'sentence_transformers'), - ('py:class', 'dirty_cat'), ('py:class', 'sklearn'), ('py:class', 'scipy'), ('py:class', 'seaborn'), + ('py:class', 'skrub'), ('py:class', 'annoy'), ('py:class', 'NetworkX graph'), ('py:class', 'Pandas dataframe'), diff --git a/graphistry/Engine.py b/graphistry/Engine.py index 1f962b992b..d2cd2943ea 100644 --- a/graphistry/Engine.py +++ b/graphistry/Engine.py @@ -1,9 +1,9 @@ from inspect import getmodule +import warnings import numpy as np import pandas as pd from typing import Any, Optional, Union from enum import Enum -from graphistry.utils.lazy_import import lazy_cudf_import class 
Engine(Enum): @@ -29,6 +29,8 @@ def resolve_engine( g_or_df: Optional[Any] = None, ) -> Engine: + from graphistry.utils.lazy_import import lazy_cudf_import + if isinstance(engine, str): engine = EngineAbstract(engine) @@ -42,7 +44,8 @@ def resolve_engine( if isinstance(g_or_df, Plottable): if g_or_df._nodes is not None and g_or_df._edges is not None: if not isinstance(g_or_df._nodes, type(g_or_df._edges)): - raise ValueError(f'Edges and nodes must be same type for auto engine selection, got: {type(g_or_df._edges)} and {type(g_or_df._nodes)}') + #raise ValueError(f'Edges and nodes must be same type for auto engine selection, got: {type(g_or_df._edges)} and {type(g_or_df._nodes)}') + warnings.warn(f'Edges and nodes must be same type for auto engine selection, got: {type(g_or_df._edges)} and {type(g_or_df._nodes)}') g_or_df = g_or_df._edges if g_or_df._edges is not None else g_or_df._nodes if g_or_df is not None: diff --git a/graphistry/Plottable.py b/graphistry/Plottable.py index 9a978d4c81..335aa3951a 100644 --- a/graphistry/Plottable.py +++ b/graphistry/Plottable.py @@ -2,7 +2,10 @@ from typing_extensions import Literal import pandas as pd +from graphistry.models.ModelDict import ModelDict from graphistry.models.compute.chain_remote import FormatType, OutputTypeAll, OutputTypeDf, OutputTypeGraph +from graphistry.models.compute.dbscan import DBSCANEngine +from graphistry.models.compute.umap import UMAPEngineConcrete from graphistry.plugins_types.cugraph_types import CuGraphKind from graphistry.Engine import Engine, EngineAbstract from graphistry.utils.json import JSONVal @@ -72,11 +75,13 @@ class Plottable(object): _node_embedding : Optional[pd.DataFrame] _node_encoder : Optional[Any] _node_features : Optional[pd.DataFrame] + _node_features_raw: Optional[pd.DataFrame] _node_target : Optional[pd.DataFrame] _edge_embedding : Optional[pd.DataFrame] _edge_encoder : Optional[Any] _edge_features : Optional[pd.DataFrame] + _edge_features_raw: Optional[pd.DataFrame] _edge_target : Optional[pd.DataFrame] _weighted_adjacency: Optional[Any] @@ -88,10 +93,27 @@ class Plottable(object): _xy: Optional[pd.DataFrame] _umap : Optional[UMAP] - _umap_params: Optional[Dict[str, Any]] + _umap_engine: Optional[UMAPEngineConcrete] + _umap_params: Optional[Union[ModelDict, Dict[str, Any]]] _umap_fit_kwargs: Optional[Dict[str, Any]] _umap_transform_kwargs: Optional[Dict[str, Any]] + # extra umap + _n_components: int + _metric: str + _n_neighbors: int + _min_dist: float + _spread: float + _local_connectivity: int + _repulsion_strength: float + _negative_sample_rate: float + _suffix: str + + _dbscan_engine: Optional[DBSCANEngine] + _dbscan_params: Optional[ModelDict] + _dbscan_nodes: Optional[Any] # fit model + _dbscan_edges: Optional[Any] # fit model + _adjacency : Optional[Any] _entity_to_index : Optional[dict] _index_to_entity : Optional[dict] diff --git a/graphistry/PlotterBase.py b/graphistry/PlotterBase.py index 885acfc8aa..7a5773423b 100644 --- a/graphistry/PlotterBase.py +++ b/graphistry/PlotterBase.py @@ -203,10 +203,16 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: # the fit umap instance self._umap = None + self._umap_engine = None self._umap_params : Optional[Dict[str, Any]] = None self._umap_fit_kwargs : Optional[Dict[str, Any]] = None self._umap_transform_kwargs : Optional[Dict[str, Any]] = None + self._dbscan_engine = None + self._dbscan_params = None + self._dbscan_nodes = None # fit model + self._dbscan_edges = None # fit model + self._adjacency = None self._entity_to_index = None 
self._index_to_entity = None @@ -216,11 +222,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self._use_feat: bool = False self._triplets: Optional[List] = None self._kg_embed_dim: int = 128 - - # Dbscan - self._node_dbscan = None # the fit dbscan instance - self._edge_dbscan = None - + # DGL self.DGL_graph = None # the DGL graph diff --git a/graphistry/ai_utils.py b/graphistry/ai_utils.py index fb1f537d35..9b74d086ba 100644 --- a/graphistry/ai_utils.py +++ b/graphistry/ai_utils.py @@ -1,7 +1,10 @@ +from inspect import getmodule +import warnings import pandas as pd import numpy as np import graphistry +from graphistry.Plottable import Plottable from .constants import DISTANCE, WEIGHT, BATCH from logging import getLogger @@ -242,18 +245,14 @@ def infer_graph( """ #enhanced = is_notebook() - print("-" * 50) if verbose else None - if infer_on_umap_embedding and emb is not None: X_previously_fit = res._node_embedding X_new = emb - print("Infering edges over UMAP embedding") if verbose else None + logger.debug("Infering edges over UMAP embedding") if verbose else None else: # can still be umap, but want to do the inference on the higher dimensional features X_previously_fit = res._node_features X_new = X - print("Infering edges over features embedding") if verbose else None - - print("-" * 45) if verbose else None + logger.debug("Infering edges over features embedding") if verbose else None FEATS = res._node_features if FEATS is None: @@ -306,15 +305,15 @@ def infer_graph( m, std = np.mean(mdists), np.std(mdists) logger.info(f"--Mean distance to existing nodes {m:.2f} +/- {std:.2f}") - print(f' Mean distance to existing nodes {m:.2f} +/- {std:.2f}') if verbose else None + logger.debug(f' Mean distance to existing nodes {m:.2f} +/- {std:.2f}') if verbose else None if eps == "auto": eps = np.min([np.abs(m - std), m]) logger.info( f"-epsilon = {eps:.2f} max distance threshold to be considered a neighbor" ) - print(f' Max distance threshold; epsilon = {eps:.2f}') if verbose else None + logger.debug(f' Max distance threshold; epsilon = {eps:.2f}') if verbose else None - print(f' Finding {n_neighbors} nearest neighbors') if verbose else None + logger.debug(f' Finding {n_neighbors} nearest neighbors') if verbose else None nn = [] for i, dist in enumerate(mdists): record_df = df.iloc[i, :] @@ -334,7 +333,7 @@ def infer_graph( old_nodes.append(this_ndf) #new_nodes.extend([record_df, this_ndf]) - print(f' {np.mean(nn):.2f} neighbors per node within epsilon {eps:.2f}') if verbose else None + logger.debug(f' {np.mean(nn):.2f} neighbors per node within epsilon {eps:.2f}') if verbose else None new_edges = pd.DataFrame(new_edges, columns=[src, dst, WEIGHT, BATCH]) @@ -342,13 +341,13 @@ def infer_graph( if len(old_edges): old_edges = pd.concat(old_edges, axis=0).assign(_batch=0) all_nodes = pd.concat([old_edges[src], old_edges[dst], new_edges[src], new_edges[dst]]).drop_duplicates() - print('', len(all_nodes), "nodes in new graph") if verbose else None + logger.debug('', len(all_nodes), "nodes in new graph") if verbose else None if sample: new_edges = pd.concat([new_edges, old_edges], axis=0).drop_duplicates() - print(' Sampled', len(old_edges.drop_duplicates()), 'previous old edges') if verbose else None + logger.debug(' Sampled', len(old_edges.drop_duplicates()), 'previous old edges') if verbose else None new_edges = new_edges.drop_duplicates() - print('', len(new_edges), 'total edges after dropping duplicates') if verbose else None + logger.debug('', len(new_edges), 'total edges after dropping 
duplicates') if verbose else None if len(old_nodes): old_nodes = pd.DataFrame(old_nodes) @@ -371,17 +370,16 @@ def infer_graph( new_features = pd.concat([X, FEATS.loc[old_nodes.index]], axis=0) new_nodes = pd.concat([df, old_nodes], axis=0) # append minibatch at top - print(" ** Final graph has", len(new_nodes), "nodes") if verbose else None - print(" - Batch has", len(df), "nodes") if verbose else None - print(" - Brought in", len(old_nodes), "nodes") if verbose else None + logger.debug(" ** Final graph has", len(new_nodes), "nodes") if verbose else None + logger.debug(" - Batch has", len(df), "nodes") if verbose else None + logger.debug(" - Brought in", len(old_nodes), "nodes") if verbose else None new_targets = pd.concat([y, Y.loc[old_nodes.index]]) if y is not None else Y - print("-" * 50) if verbose else None return hydrate_graph(res, new_nodes, new_edges, node, src, dst, new_emb, new_features, new_targets) -def infer_self_graph(res, +def infer_self_graph(res: Plottable, emb, X, y, df, infer_on_umap_embedding=False, eps="auto", n_neighbors=7, verbose=False, ): """ @@ -401,8 +399,29 @@ def infer_self_graph(res, graphistry Plottable object """ #enhanced = is_notebook() - - print("-" * 50) if verbose else None + + was_cudf = False + emb_orig = emb + X_orig = X + y_orig = y + if 'cudf' in str(getmodule(emb)) or 'cudf' in str(getmodule(X)): + warnings.warn("cudf not supported in this function, converting to pandas") + was_cudf = True + import cudf + if emb is not None and isinstance(emb, cudf.DataFrame): + emb = emb.to_pandas() + if X is not None and isinstance(X, cudf.DataFrame): + X = X.to_pandas() + if df is not None and isinstance(df, cudf.DataFrame): + df = df.to_pandas() + + #WIP + if 'cudf' in str(getmodule(emb)) or 'cudf' in str(getmodule(X)): + import cudf + import cupy as cp + ncp = cp + else: + ncp = np if infer_on_umap_embedding and emb is not None: X_previously_fit = emb @@ -413,8 +432,6 @@ def infer_self_graph(res, X_new = X print("Infering edges over features embedding") if verbose else None - print("-" * 45) if verbose else None - assert ( df.shape[0] == X.shape[0] ), "minibatches df and X must have same number of rows since f(df) = X" @@ -424,11 +441,11 @@ def infer_self_graph(res, ), "minibatches emb and X must have same number of rows since h(df) = emb" df = df.assign(x=emb.x, y=emb.y) # add x and y to df for graphistry instance else: # if umap has been fit, but only transforming over features, need to add x and y or breaks plot binds of res - df['x'] = np.random.random(df.shape[0]) - df['y'] = np.random.random(df.shape[0]) + df['x'] = ncp.random.random(df.shape[0]) + df['y'] = ncp.random.random(df.shape[0]) # if umap, need to add '_n' as node id to df, adding new indices to existing graph - numeric_indices = np.arange( + numeric_indices = ncp.arange( X_previously_fit.shape[0], dtype=np.float64 # this seems off but works ) @@ -451,16 +468,16 @@ def infer_self_graph(res, mdists.append(dist) m, std = np.mean(mdists), np.std(mdists) - logger.info(f"--Mean distance to existing nodes {m:.2f} +/- {std:.2f}") - print(f' Mean distance to existing nodes {m:.2f} +/- {std:.2f}') if verbose else None + logger.debug(f"--Mean distance to existing nodes {m:.2f} +/- {std:.2f}") + logger.debug(f' Mean distance to existing nodes {m:.2f} +/- {std:.2f}') if verbose else None if eps == "auto": eps = np.min([np.abs(m - std), m]) - logger.info( + logger.debug( f" epsilon = {eps:.2f} max distance threshold to be considered a neighbor" ) - print(f' Max distance threshold; epsilon = {eps:.2f}') 
if verbose else None + logger.debug(f' Max distance threshold; epsilon = {eps:.2f}') if verbose else None - print(f' Finding {n_neighbors} nearest neighbors') if verbose else None + logger.debug(f' Finding {n_neighbors} nearest neighbors') if verbose else None nn = [] for i, dist in enumerate(mdists): record_df = df.iloc[i, :] @@ -473,12 +490,19 @@ def infer_self_graph(res, new_edges.append([this_ndf[node], record_df[node], weight, 1]) old_nodes.append(this_ndf) - print(f' {np.mean(nn):.2f} neighbors per node within epsilon {eps:.2f}') if verbose else None + logger.debug(f' {np.mean(nn):.2f} neighbors per node within epsilon {eps:.2f}') if verbose else None - new_edges = pd.DataFrame(new_edges, columns=[src, dst, WEIGHT, BATCH]) - new_edges = new_edges.drop_duplicates() - print('', len(new_edges), 'total edges after dropping duplicates') if verbose else None - print(" ** Final graph has", len(df), "nodes") if verbose else None + new_edges_df = pd.DataFrame(new_edges, columns=[src, dst, WEIGHT, BATCH]) + new_edges_df = new_edges_df.drop_duplicates() + logger.debug('', len(new_edges_df), 'total edges after dropping duplicates') if verbose else None + logger.debug(" ** Final graph has", len(df), "nodes") if verbose else None # ######################################################### - print("-" * 50) if verbose else None - return hydrate_graph(res, df, new_edges, node, src, dst, emb, X, y) + + if was_cudf: + import cudf + if isinstance(df, pd.DataFrame): + df = cudf.DataFrame.from_pandas(df) + if isinstance(new_edges_df, pd.DataFrame): + new_edges_df = cudf.DataFrame.from_pandas(new_edges_df) + + return hydrate_graph(res, df, new_edges_df, node, src, dst, emb_orig, X_orig, y_orig) diff --git a/graphistry/compute/cluster.py b/graphistry/compute/cluster.py index 2d742b422b..8d2c65d8c9 100644 --- a/graphistry/compute/cluster.py +++ b/graphistry/compute/cluster.py @@ -1,16 +1,23 @@ -import logging -import pandas as pd -import numpy as np - from typing import Any, List, Union, TYPE_CHECKING, Tuple, Optional from typing_extensions import Literal from collections import Counter - +from inspect import getmodule +import numpy as np +import pandas as pd +import logging +import warnings + +from graphistry.Engine import Engine, resolve_engine +from graphistry.models.compute.dbscan import ( + DBSCANEngine, DBSCANEngineAbstract, + dbscan_engine_values +) +from graphistry.models.compute.features import GraphEntityKind, graph_entity_kind_values from graphistry.Plottable import Plottable -from graphistry.constants import CUML, UMAP_LEARN, DBSCAN # noqa type: ignore -from graphistry.features import ModelDict +from graphistry.constants import CUML, DBSCAN +from graphistry.models.ModelDict import ModelDict from graphistry.feature_utils import get_matrix_by_column_parts -from graphistry.utils.lazy_import import lazy_cudf_import, lazy_dbscan_import +from graphistry.utils.lazy_import import lazy_dbscan_import logger = logging.getLogger("compute.cluster") @@ -19,56 +26,64 @@ else: MIXIN_BASE = object -DBSCANEngineConcrete = Literal["cuml", "umap_learn"] -DBSCANEngine = Literal[DBSCANEngineConcrete, "auto"] +def resolve_dbscan_engine( + engine: DBSCANEngineAbstract, + g_or_df: Optional[Any] = None +) -> DBSCANEngine: + """ + Resolves the engine to use for DBSCAN clustering -def resolve_cpu_gpu_engine( - engine: DBSCANEngine, -) -> DBSCANEngineConcrete: # noqa - if engine in [CUML, UMAP_LEARN, 'sklearn']: + If 'auto', decide by checking if cuml or sklearn is installed, and if provided, natural type of the dataset. 
GPU is used if both a GPU dataset and GPU library is installed. Otherwise, CPU library. + """ + if engine in dbscan_engine_values: return engine # type: ignore - if engine in ["auto"]: + if engine == "umap_learn": + warnings.warn("engine value 'umap_learn' is deprecated, use engine='cuml' or 'sklearn' instead; defaulting to sklearn") + return "sklearn" + if engine == "auto": + + preferred_engine = None if g_or_df is None else resolve_engine('auto', g_or_df) + if preferred_engine in [Engine.DASK, Engine.DASK_CUDF]: + raise ValueError('dask not supported for DBSCAN clustering, .compute() values first') + assert preferred_engine in [None, Engine.PANDAS, Engine.CUDF] + ( has_min_dependency, _, has_cuml_dependency, _, ) = lazy_dbscan_import() - if has_cuml_dependency: + if has_cuml_dependency and preferred_engine in [None, 'cudf']: return "cuml" if has_min_dependency: - return "umap_learn" - - raise ValueError( # noqa - f'engine expected to be "auto", ' - '"umap_learn", "pandas", "sklearn", or "cuml" ' - f"but received: {engine} :: {type(engine)}" - ) - -def make_safe_gpu_dataframes(X, y, engine): - """helper method to coerce a dataframe to the correct type (pd vs cudf)""" - def safe_cudf(X, y): - new_kwargs = {} - kwargs = {'X': X, 'y': y} - for key, value in kwargs.items(): - if isinstance(value, cudf.DataFrame) and engine in ["pandas", 'sklearn', 'umap_learn']: - new_kwargs[key] = value.to_pandas() - elif isinstance(value, pd.DataFrame) and engine == "cuml": - new_kwargs[key] = cudf.from_pandas(value) - else: - new_kwargs[key] = value - return new_kwargs['X'], new_kwargs['y'] - - has_cudf_dependancy_, _, cudf = lazy_cudf_import() - if has_cudf_dependancy_: - # print('DBSCAN CUML Matrices') - return safe_cudf(X, y) - else: - return X, y + return "sklearn" + + raise ValueError(f'Engine expected to be "auto" with cuml/sklearn installed, "sklearn", or "cuml", but received: {engine} :: {type(engine)}') + +def make_safe_gpu_dataframes( + X: Optional[Any], y: Optional[Any], engine: Engine +) -> Tuple[Optional[Any], Optional[Any]]: + """Coerce a dataframe to pd vs cudf based on engine""" + + assert engine in [Engine.PANDAS, Engine.CUDF], f"Expected engine to be 'pandas' or 'cudf', got {engine}" + + def df_as_dbscan_engine(df: Optional[Any], engine: Engine) -> Optional[Any]: + if df is None: + return None + if isinstance(df, pd.DataFrame) and engine == Engine.CUDF: + import cudf + return cudf.from_pandas(df) + elif 'cudf' in str(getmodule(df)) and engine == Engine.PANDAS: + return df.to_pandas() + return df + + return df_as_dbscan_engine(X, engine), df_as_dbscan_engine(y, engine) -def get_model_matrix(g, kind: str, cols: Optional[Union[List, str]], umap, target): +def get_model_matrix( + g: Plottable, kind: GraphEntityKind, cols: Optional[Union[List, str]], umap, target +) -> Any: """ Allows for a single function to get the model matrix for both nodes and edges as well as targets, embeddings, and features @@ -82,55 +97,96 @@ def get_model_matrix(g, kind: str, cols: Optional[Union[List, str]], umap, targe Returns: pd.DataFrame: dataframe of model matrix given the inputs """ - assert kind in ["nodes", "edges"] + assert kind in graph_entity_kind_values, f'Expected kind of {graph_entity_kind_values}, got: {kind}' assert ( hasattr(g, "_node_encoder") if kind == "nodes" else hasattr(g, "_edge_encoder") ) + engine = g._dbscan_engine + assert engine is not None, 'DBSCAN engine not set' + + df_engine: Engine = Engine.CUDF if engine == 'cuml' else Engine.PANDAS + + ### + + from graphistry.feature_utils import 
FeatureMixin + assert isinstance(g, FeatureMixin) + + # TODO does get_matrix do cudf? df = g.get_matrix(cols, kind=kind, target=target) + # TODO does _get_embedding do cudf? if umap and cols is None and g._umap is not None: - df = g._get_embedding(kind) + from graphistry.umap_utils import UMAPMixin + assert isinstance(g, UMAPMixin) + df = g._get_embedding(kind) - #if g.engine_dbscan in [CUML]: - df, _ = make_safe_gpu_dataframes(df, None, g.engine_dbscan) - #print('\n df:', df.shape, df.columns) - return df + df2, _ = make_safe_gpu_dataframes(df, None, df_engine) + return df2 -def dbscan_fit(g: Any, dbscan: Any, kind: str = "nodes", cols: Optional[Union[List, str]] = None, use_umap_embedding: bool = True, target: bool = False, verbose: bool = False): + +def dbscan_fit_inplace( + res: Plottable, dbscan: Any, kind: GraphEntityKind = "nodes", + cols: Optional[Union[List, str]] = None, use_umap_embedding: bool = True, + target: bool = False, verbose: bool = False +) -> None: """ Fits clustering on UMAP embeddings if umap is True, otherwise on the features dataframe or target dataframe if target is True. + Sets: + - `res._dbscan_edges` or `res._dbscan_nodes` to the DBSCAN model + - `res._edges` or `res._nodes` gains column `_dbscan` + Args: - :g: graphistry graph + :res: graphistry graph :kind: 'nodes' or 'edges' :cols: list of columns to use for clustering given `g.featurize` has been run :use_umap_embedding: whether to use UMAP embeddings or features dataframe for clustering (default: True) + :target: whether to use the target dataframe or features dataframe (typically False, for features) """ - X = get_model_matrix(g, kind, cols, use_umap_embedding, target) + X = get_model_matrix(res, kind, cols, use_umap_embedding, target) if X.empty: raise ValueError("No features found for clustering") - dbscan.fit(X) - # this is a future feature one cuml supports it - if g.engine_dbscan == 'cuml': - labels = dbscan.labels_.to_numpy() + logger.debug('dbscan_fit dbscan: %s', str(getmodule(dbscan))) + + labels: np.ndarray + if res._dbscan_engine == 'cuml': + import cupy as cp + from cuml import DBSCAN + assert isinstance(dbscan, DBSCAN), f'Expected cuml.DBSCAN, got: {type(dbscan)}' + dbscan.fit(X, calc_core_sample_indices=True) + labels = dbscan.labels_ + core_sample_indices = dbscan.core_sample_indices_ + + # Convert core_sample_indices_ to cupy if it's not already + # (Sometimes it's already cupy; if it's a CumlArray, we can cast or just index directly) + core_sample_indices_cupy = core_sample_indices.astype(cp.int32) + + # The actual core-sample points (a.k.a. "components_" in sklearn terms) + components = X[core_sample_indices_cupy] + dbscan.components_ = components + # dbscan.components_ = X[dbscan.core_sample_indices_.to_pandas()] # can't believe len(samples) != unique(labels) ... 
#cumlfail else: + from sklearn.cluster import DBSCAN + assert isinstance(dbscan, DBSCAN), f'Expected sklearn.DBSCAN, got: {type(dbscan)}' + dbscan.fit(X) labels = dbscan.labels_ if kind == "nodes": - g._nodes = g._nodes.assign(_dbscan=labels) + res._nodes = res._nodes.assign(_dbscan=labels) + res._dbscan_nodes = dbscan elif kind == "edges": - g._edges = g._edges.assign(_dbscan=labels) + res._edges = res._edges.assign(_dbscan=labels) + res._dbscan_edges = dbscan else: - raise ValueError("kind must be one of `nodes` or `edges`") + raise ValueError(f"kind must be one of `nodes` or `edges`, got {kind}") - kind = "node" if kind == "nodes" else "edge" - setattr(g, f"_{kind}_dbscan", dbscan) + setattr(res, f"_{kind}_dbscan", dbscan) if cols is not None: # set False since we used the features for verbose use_umap_embedding = False @@ -138,16 +194,12 @@ def dbscan_fit(g: Any, dbscan: Any, kind: str = "nodes", cols: Optional[Union[Li if verbose: cnt = Counter(labels) message = f"DBSCAN found {len(cnt)} clusters with {cnt[-1]} outliers" - print() - print('-' * len(message)) - print(message) - print(f"--fit on {'umap embeddings' if use_umap_embedding else 'feature embeddings'} of size {X.shape}") - print('-' * len(message)) - - return g + logger.debug(message) + logger.debug(f"--fit on {'umap embeddings' if use_umap_embedding else 'feature embeddings'} of size {X.shape} :: {X.dtypes}") -def dbscan_predict(X: pd.DataFrame, model: Any): +# TODO what happens in gpu mode? +def dbscan_predict_sklearn(X: pd.DataFrame, model: Any) -> np.ndarray: """ DBSCAN has no predict per se, so we reverse engineer one here from https://stackoverflow.com/questions/27822752/scikit-learn-predicting-new-points-with-dbscan @@ -169,22 +221,67 @@ def dbscan_predict(X: pd.DataFrame, model: Any): return y_new +def dbscan_predict_cuml(X: Any, model: Any) -> Any: + + import cudf + import cupy as cp + from sklearn.cluster import DBSCAN as skDBSCAN + from cuml import DBSCAN + #assert isinstance(X, cudf.DataFrame), f'Expected cudf.DataFrame, got: {type(X)}' + if isinstance(X, cudf.DataFrame): + X = X.to_pandas() + + if isinstance(X, pd.DataFrame) and isinstance(model, skDBSCAN): + return dbscan_predict_sklearn(X, model) + + assert isinstance(model, DBSCAN), f'Expected cuml.DBSCAN, got: {type(model)}' + + #raise NotImplementedError('cuml lacks predict, and for cpu fallback, components_') + warnings.warn('cuml lacks predict, cpu fallback, components_') + + n_samples = X.shape[0] + + y_new = np.ones(shape=n_samples, dtype=int) * -1 + + components = model.components_.to_pandas() if isinstance(model.components_, cudf.DataFrame) else model.components_ + + for i in range(n_samples): + diff = components - X.iloc[i, :].values # NumPy broadcasting + + dist = np.linalg.norm(diff, axis=1) # Euclidean distance + + shortest_dist_idx = np.argmin(dist) + + if dist[shortest_dist_idx] < model.eps: + y_new[i] = model.labels_[model.core_sample_indices_[shortest_dist_idx]] + + return y_new + + + class ClusterMixin(MIXIN_BASE): def __init__(self, *args, **kwargs): pass def _cluster_dbscan( - self, res, kind, cols, fit_umap_embedding, target, min_dist, min_samples, engine_dbscan, verbose, *args, **kwargs + self, kind: GraphEntityKind, cols, fit_umap_embedding, target, min_dist, min_samples, engine_dbscan: DBSCANEngineAbstract, verbose, *args, **kwargs ): """DBSCAN clustering on cpu or gpu infered by .engine flag """ _, DBSCAN, _, cuDBSCAN = lazy_dbscan_import() + res = self.bind() + + engine_dbscan = resolve_dbscan_engine(engine_dbscan, res) + if 
engine_dbscan in [CUML]: - print('`g.transform_dbscan(..)` not supported for engine=cuml, will return `g.transform_umap(..)` instead') + warnings.warn('`_cluster_dbscan(..)` experimental') + #engine_dbscan = 'sklearn' + + dbscan_engine = cuDBSCAN if engine_dbscan == CUML else DBSCAN - res.engine_dbscan = engine_dbscan # resolve_cpu_gpu_engine(engine_dbscan) # resolve_cpu_gpu_engine("auto") + res._dbscan_engine = engine_dbscan res._dbscan_params = ModelDict( "latest DBSCAN params", kind=kind, @@ -197,16 +294,8 @@ def _cluster_dbscan( verbose=verbose, ) - dbscan = ( - cuDBSCAN(eps=min_dist, min_samples=min_samples, *args, **kwargs) - if res.engine_dbscan == CUML - else DBSCAN(eps=min_dist, min_samples=min_samples, *args, **kwargs) - ) - # print('dbscan:', dbscan) - - res = dbscan_fit( - res, dbscan, kind=kind, cols=cols, use_umap_embedding=fit_umap_embedding, verbose=verbose - ) + dbscan = dbscan_engine(eps=min_dist, min_samples=min_samples, *args, **kwargs) + dbscan_fit_inplace(res, dbscan, kind=kind, cols=cols, use_umap_embedding=fit_umap_embedding, verbose=verbose) return res @@ -215,17 +304,19 @@ def dbscan( min_dist: float = 0.2, min_samples: int = 1, cols: Optional[Union[List, str]] = None, - kind: str = "nodes", + kind: GraphEntityKind = "nodes", fit_umap_embedding: bool = True, target: bool = False, verbose: bool = False, - engine_dbscan: str = 'sklearn', + engine_dbscan: DBSCANEngineAbstract = 'auto', *args, **kwargs, ): """DBSCAN clustering on cpu or gpu infered automatically. Adds a `_dbscan` column to nodes or edges. NOTE: g.transform_dbscan(..) currently unsupported on GPU. + Saves model as g._dbscan_nodes or g._dbscan_edges + Examples: :: @@ -268,9 +359,7 @@ def dbscan( """ - res = self.bind() - res = res._cluster_dbscan( - res, + res = self._cluster_dbscan( kind=kind, cols=cols, fit_umap_embedding=fit_umap_embedding, @@ -281,12 +370,12 @@ def dbscan( verbose=verbose, *args, **kwargs, - ) + ) # type: ignore return res def _transform_dbscan( - self, df: pd.DataFrame, ydf, kind, verbose + self: Plottable, df: pd.DataFrame, ydf, kind, verbose ) -> Tuple[Union[pd.DataFrame, None], pd.DataFrame, pd.DataFrame, pd.DataFrame]: res = self.bind() @@ -296,7 +385,7 @@ def _transform_dbscan( umap = res._dbscan_params["fit_umap_embedding"] target = res._dbscan_params["target"] - dbscan = res._node_dbscan if kind == "nodes" else res._edge_dbscan + dbscan = res._dbscan_nodes if kind == "nodes" else res._dbscan_edges # print('DBSCAN TYPE IN TRANSFORM', type(dbscan)) emb = None @@ -315,13 +404,13 @@ def _transform_dbscan( else: X_ = XX - if res.engine_dbscan == 'cuml': + if res._dbscan_engine == 'cuml': print('Transform DBSCAN not yet supported for engine_dbscan=`cuml`, use engine=`umap_learn`, `pandas` or `sklearn` instead') return emb, X, y, df - X_, emb = make_safe_gpu_dataframes(X_, emb, 'pandas') + X_, emb = make_safe_gpu_dataframes(X_, emb, Engine.PANDAS) - labels = dbscan_predict(X_, dbscan) # type: ignore + labels = dbscan_predict_cuml(X_, dbscan) # type: ignore #print('after dbscan predict', type(labels)) if umap and cols is None: df = df.assign(_dbscan=labels, x=emb.x, y=emb.y) # type: ignore @@ -399,10 +488,17 @@ def transform_dbscan( """ emb, X, y, df = self._transform_dbscan(df, y, kind=kind, verbose=verbose) if return_graph and kind not in ["edges"]: - df, y = make_safe_gpu_dataframes(df, y, 'pandas') - X, emb = make_safe_gpu_dataframes(X, emb, 'pandas') - g = self._infer_edges(emb, X, y, df, eps=min_dist, sample=sample, n_neighbors=n_neighbors, # type: ignore + #raise 
NotImplementedError("Engine specificity") + #if 'cudf' in str(getmodule(df)) or 'cudf' in str(getmodule(y)): + # warnings.warn("transform_dbscan using cpu fallback") + #df, y = make_safe_gpu_dataframes(df, y, Engine.PANDAS) + #X, emb = make_safe_gpu_dataframes(X, emb, Engine.PANDAS) + engine = self._dbscan_engine + engine_df = Engine.CUDF if engine == 'cuml' else Engine.PANDAS + df2, y2 = make_safe_gpu_dataframes(df, y, engine_df) + X2, emb2 = make_safe_gpu_dataframes(X, emb, engine_df) + g = self._infer_edges(emb2, X2, y2, df2, eps=min_dist, sample=sample, n_neighbors=n_neighbors, # type: ignore infer_on_umap_embedding=infer_umap_embedding - ) + ) return g return emb, X, y, df diff --git a/graphistry/compute/conditional.py b/graphistry/compute/conditional.py index df96c1c31f..b74d0f9d48 100644 --- a/graphistry/compute/conditional.py +++ b/graphistry/compute/conditional.py @@ -33,7 +33,7 @@ def conditional_probability(x, given, df: pd.DataFrame): pd.DataFrame: the conditional probability of x given the column 'given' """ - return df.groupby([ given ])[ x ].apply(lambda g : g.value_counts()/len(g)) # noqa type: ignore + return df.groupby([ given ])[ x ].apply(lambda g : g.value_counts() / len(g)) def probs(x, given, df: pd.DataFrame, how='index'): diff --git a/graphistry/constants.py b/graphistry/constants.py index f6fda05fd9..34cf504de7 100644 --- a/graphistry/constants.py +++ b/graphistry/constants.py @@ -43,8 +43,7 @@ # ############################################################## # for preprocessors namespace -# for dirty_cat params -DIRTY_CAT = "dirty_cat" +SKRUB = 'skrub' N_TOPICS_DEFAULT = 42 N_TOPICS_TARGET_DEFAULT = 7 N_HASHERS_DEFAULT = 100 diff --git a/graphistry/dgl_utils.py b/graphistry/dgl_utils.py index b3d82418ba..a49faec41d 100644 --- a/graphistry/dgl_utils.py +++ b/graphistry/dgl_utils.py @@ -1,6 +1,8 @@ # classes for converting a dataframe or Graphistry Plottable into a DGL from collections import Counter +from inspect import getmodule from typing import Dict, Optional, TYPE_CHECKING, Tuple +import warnings import numpy as np import pandas as pd @@ -111,12 +113,26 @@ def reindex_edgelist(df, src, dst): """ srclist = df[src] dstlist = df[dst] - cnt = Counter( - pd.concat([srclist, dstlist], axis=0) - ) # can also use pd.Factorize but doesn't order by count, which is satisfying - ordered_nodes_dict = {k: i for i, (k, c) in enumerate(cnt.most_common())} - df[config.SRC] = df[src].apply(lambda x: ordered_nodes_dict[x]) - df[config.DST] = df[dst].apply(lambda x: ordered_nodes_dict[x]) + if isinstance(df, pd.DataFrame): + cnt = Counter( + pd.concat([srclist, dstlist], axis=0) + ) # can also use pd.Factorize but doesn't order by count, which is satisfying + ordered_nodes_dict = {k: i for i, (k, c) in enumerate(cnt.most_common())} + df[config.SRC] = df[src].apply(lambda x: ordered_nodes_dict[x]) + df[config.DST] = df[dst].apply(lambda x: ordered_nodes_dict[x]) + elif 'cudf' in str(getmodule(df)): + import cudf + if isinstance(df, cudf.DataFrame): + cnt = Counter( + cudf.concat([srclist, dstlist], axis=0).to_pandas() + ) + ordered_nodes_dict = {k: i for i, (k, c) in enumerate(cnt.most_common())} + df[config.SRC] = cudf.Series.from_pandas(df[src].to_pandas().apply(lambda x: ordered_nodes_dict[x])) + df[config.DST] = cudf.Series.from_pandas(df[dst].to_pandas().apply(lambda x: ordered_nodes_dict[x])) + else: + raise ValueError("df must be cudf.DataFrame or pd.DataFrame") + else: + raise ValueError("df must be cudf.DataFrame or pd.DataFrame") return df, ordered_nodes_dict @@ -141,6 
+157,10 @@ def pandas_to_sparse_adjacency(df, src, dst, weight_col): eweight = df[weight_col].values shape = len(ordered_nodes_dict) + if not isinstance(df, pd.DataFrame): + if 'cudf' in str(getmodule(df)): + warnings.warn("cudf not supported for coo_matrix, converting to pandas") + df = df.to_pandas() sp_mat = coo_matrix( (eweight, (df[config.SRC], df[config.DST])), shape=(shape, shape) ) @@ -232,8 +252,8 @@ def _remove_edges_not_in_nodes(self, node_column: str): if self._source is None or self._destination is None: raise ValueError("Need to have source and destination columns bound, call bind() or edges()") - if not isinstance(self._edges, pd.DataFrame): # type: ignore - raise ValueError("self._edges for DGLGraphMix must be pd.DataFrame, recieved: %s", type(self._edges)) # type: ignore + if 'cudf' not in str(getmodule(self._edges)) and not isinstance(self._edges, pd.DataFrame): + raise ValueError("self._edges for DGLGraphMix must be pd.DataFrame/cudf.DataFrame, received: %s", type(self._edges)) # type: ignore edf: pd.DataFrame = self._edges # type: ignore n_initial = len(edf) logger.info(f"Length of edge DataFrame {n_initial}") @@ -243,7 +263,7 @@ def _remove_edges_not_in_nodes(self, node_column: str): # print(f'OG: length: {len(edf)}') assert ( - sum(mask) > 2 + mask.sum() > 2 ), f"mask slice is (practically) empty, will lead to bad graph, found {sum(mask)}" self._MASK = mask # type: ignore self._edges = edf[mask] # type: ignore @@ -482,6 +502,10 @@ def build_gnn( reuse_if_existing=reuse_if_existing, *args, **kwargs) if featurize_edges: + if not isinstance(res._edges, pd.DataFrame): + if 'cudf' in str(getmodule(res._edges)): + warnings.warn("cudf not supported for featurizing edges, converting to pandas") + res = res.edges(res._edges.to_pandas()) # type: ignore res = res._featurize_edges_to_dgl( res, **kwargs_edges diff --git a/graphistry/feature_utils.py b/graphistry/feature_utils.py index 21dd56d8e8..66fad7a95f 100644 --- a/graphistry/feature_utils.py +++ b/graphistry/feature_utils.py @@ -15,17 +15,19 @@ Optional, Tuple, TYPE_CHECKING, -) # noqa -from typing_extensions import Literal # Literal native to py3.8+ +) from graphistry.compute.ComputeMixin import ComputeMixin from graphistry.config import config as graphistry_config from graphistry.features import ScalerType +from graphistry.models.compute.features import ( + GraphEntityKind, + FeatureEngineConcrete, FeatureEngine, feature_engine_concrete_values +) from graphistry.utils.lazy_import import ( lazy_sentence_transformers_import, lazy_import_has_min_dependancy, - lazy_dirty_cat_import, - assert_imported_text, + lazy_skrub_import, assert_imported ) from . 
import constants as config @@ -47,13 +49,13 @@ except ImportError: SentenceTransformer = Any # type:ignore try: - from dirty_cat import ( - SuperVectorizer, + from skrub import ( + TableVectorizer, GapEncoder, SimilarityEncoder, ) except: - SuperVectorizer = Any + TableVectorizer = Any GapEncoder = Any SimilarityEncoder = Any try: @@ -67,7 +69,7 @@ MIXIN_BASE = object Pipeline = Any SentenceTransformer = Any - SuperVectorizer = Any + TableVectorizer = Any GapEncoder = Any SimilarityEncoder = Any FunctionTransformer = Any @@ -107,15 +109,21 @@ def is_cudf_s(s: Any) -> bool: # # featurize_or_get_edges_dataframe_if_X_is_None -FeatureEngineConcrete = Literal["none", "pandas", "dirty_cat", "torch"] -FeatureEngine = Literal[FeatureEngineConcrete, "auto"] - def resolve_feature_engine( feature_engine: FeatureEngine, -) -> FeatureEngineConcrete: # noqa +) -> FeatureEngineConcrete: + + if feature_engine == "dirty_cat": + # deprecation warning + warnings.warn( + "dirty_cat is deprecated, please use skrub instead; attempting automatic conversion", + DeprecationWarning, + stacklevel=2 + ) + return "skrub" - if feature_engine in ["none", "pandas", "dirty_cat", "torch"]: + if feature_engine in feature_engine_concrete_values: return feature_engine # type: ignore if feature_engine == "auto": @@ -124,12 +132,12 @@ def resolve_feature_engine( return "torch" has_min_dependancy_, _ = lazy_import_has_min_dependancy() if has_min_dependancy_: - return "dirty_cat" + return "skrub" return "pandas" - raise ValueError( # noqa + raise ValueError( f'feature_engine expected to be "none", ' - '"pandas", "dirty_cat", "torch", or "auto"' + '"pandas", "skrub", "torch", or "auto"' f'but received: {feature_engine} :: {type(feature_engine)}' ) @@ -215,7 +223,7 @@ def safe_divide(a, b): b = np.array(b) return np.divide( a, b, out=np.zeros_like(a), where=b != 0.0, casting="unsafe" - ) # noqa + ) def features_without_target( @@ -230,22 +238,12 @@ def features_without_target( if y is None: return df remove_cols = [] - if y is None: - pass - elif isinstance(y, pd.DataFrame): - yc = y.columns - xc = df.columns - for c in yc: - if c in xc: - remove_cols.append(c) + if isinstance(y, pd.DataFrame): + remove_cols = list(df.columns.intersection(y.columns)) elif is_cudf_df(y): import cudf assert isinstance(y, cudf.DataFrame) - yc = y.columns - xc = df.columns - for c in yc: - if c in xc: - remove_cols.append(c) + remove_cols = list(df.columns.intersection(y.columns)) elif isinstance(y, pd.Series): if y.name and (y.name in df.columns): remove_cols = [y.name] @@ -255,16 +253,14 @@ def features_without_target( if y.name and (y.name in df.columns): remove_cols = [y.name] elif isinstance(y, List): - remove_cols = y # noqa + raise NotImplementedError("y-as-a-list not implemented") elif isinstance(y, str): - remove_cols = [y] + raise NotImplementedError("y-as-a-string not implemented") else: - logger.warning( - "Target is not of type(DataFrame) and has no columns" - ) # noqa + raise ValueError(f"Expected y target to be one of None, DF, Series, List, str, got: {type(y)}") if len(remove_cols): - logger.debug(f"Removing {remove_cols} columns from DataFrame") - tf = df.drop(columns=remove_cols, errors="ignore") # noqa + warnings.warn(f"Removing target y {remove_cols} columns from X DataFrame") + tf = df.drop(columns=remove_cols) return tf return df @@ -280,7 +276,7 @@ def remove_node_column_from_symbolic(X_symbolic, node): return X_symbolic.drop(columns=[node], errors="ignore") -def remove_internal_namespace_if_present(df: pd.DataFrame): +def 
remove_internal_namespace_if_present(df: pd.DataFrame) -> pd.DataFrame: """ Some tranformations below add columns to the DataFrame, this method removes them before featurization @@ -290,7 +286,7 @@ def remove_internal_namespace_if_present(df: pd.DataFrame): :return: DataFrame with dropped columns in reserved namespace """ if df is None: - return None + raise ValueError("DataFrame is None") # here we drop all _namespace like _x, _y, etc, so that # featurization doesn't include them idempotent-ly reserved_namespace : List[str] = [ @@ -313,6 +309,19 @@ def remove_internal_namespace_if_present(df: pd.DataFrame): return df +def drop_duplicates_with_warning(df: pd.DataFrame) -> pd.DataFrame: + duplicate_cols = df.columns.duplicated() + duplicates = [ + name for name, dupe in zip(df.columns, duplicate_cols) if dupe + ] + if duplicates: + warnings.warn( + f"Duplicate columns found: {duplicates}, using first of each." + ) + df = df[ [c for c, dupe in zip(df.columns, duplicate_cols) if not dupe] ] + return df + + # ########################################################################### # # Featurization Functions and Utils @@ -554,7 +563,7 @@ def get_preprocessing_pipeline( n_bins: int = 10, encode: str = "ordinal", strategy: str = "quantile", -) -> Pipeline: # noqa +) -> Pipeline: """Helper function for imputing and scaling np.ndarray data using different scaling transformers. :param X: np.ndarray @@ -645,9 +654,37 @@ def fit_pipeline( columns = X.columns index = X.index + was_cudf = 'cudf' in str(getmodule(X)) + if was_cudf: + warnings.warn("cudf DataFrames are being converted to pandas for preprocessing", UserWarning) + import cudf + if isinstance(X, cudf.DataFrame): + X = X.to_pandas() + elif isinstance(X, cudf.Series): + raise ValueError("cudf Series not supported") + else: + raise ValueError(f'Unexpected type for X: {type(X)}') + X = transformer.fit_transform(X) if keep_n_decimals: - X = np.round(X, decimals=keep_n_decimals) # type: ignore # noqa + if 'cudf' in str(getmodule(X)) or 'cupy' in str(getmodule(X)): + try: + # Is this a cupy, cudf, ? + X = X.round(decimals=keep_n_decimals) + except: + logger.error(f"Failed to round GPU object of type {type(X)}") + raise + else: + if isinstance(X, np.ndarray): + X = np.round(X, decimals=keep_n_decimals) + else: + X = X.round(decimals=keep_n_decimals) + + if was_cudf: + import cudf + if isinstance(X, pd.DataFrame): + return X + return cudf.DataFrame(X, columns=columns, index=index) return pd.DataFrame(X, columns=columns, index=index) @@ -755,7 +792,12 @@ def encode_textual( f"Encoded Textual Data using {model} at " f"{len(df) / ((time() - t) / 60):.2f} rows per minute" ) - res = pd.DataFrame(embeddings, + logger.debug('embeddings type=%s, df type=%s', type(embeddings), type(df)) + if 'cudf' in str(getmodule(df)): + import cudf + res = cudf.DataFrame(embeddings, columns=transformed_columns, index=df.index) + else: + res = pd.DataFrame(embeddings, columns=transformed_columns, index=df.index) @@ -879,11 +921,11 @@ def process_dirty_dataframes( ) -> Tuple[ pd.DataFrame, Optional[pd.DataFrame], - Union[SuperVectorizer, FunctionTransformer], - Union[SuperVectorizer, FunctionTransformer], + Union[TableVectorizer, FunctionTransformer], + Union[TableVectorizer, FunctionTransformer], ]: """ - Dirty_Cat encoder for record level data. Will automatically turn + skrub encoder for record level data. Will automatically turn inhomogeneous dataframe into matrix using smart conversion tricks. 
:param ndf: node DataFrame @@ -895,22 +937,21 @@ def process_dirty_dataframes( :param n_topics: number of topics for GapEncoder, default 42 :param similarity: one of 'ngram', 'levenshtein-ratio', 'jaro', or'jaro-winkler'}) – The type of pairwise string similarity - to use. If None or False, uses a SuperVectorizer + to use. If None or False, uses a TableVectorizer :return: Encoded data matrix and target (if not None), the data encoder, and the label encoder. """ - has_dirty_cat, _, dirty_cat = lazy_dirty_cat_import() - if has_dirty_cat: - from dirty_cat import SuperVectorizer, GapEncoder, SimilarityEncoder + has_skrub, _, skrub = lazy_skrub_import() + if has_skrub: + from skrub import TableVectorizer, GapEncoder, SimilarityEncoder from sklearn.preprocessing import FunctionTransformer t = time() all_numeric = is_dataframe_all_numeric(ndf) - if not all_numeric and has_dirty_cat and (feature_engine in ["dirty_cat", "torch"]): - data_encoder = SuperVectorizer( - auto_cast=True, + if not all_numeric and has_skrub and (feature_engine in ["skrub", "torch"]): + data_encoder = TableVectorizer( cardinality_threshold=cardinality_threshold, - high_card_cat_transformer=GapEncoder(n_topics), + high_cardinality=GapEncoder(n_topics), # numerical_transformer=StandardScaler(), This breaks # since -- AttributeError: Transformer numeric # (type StandardScaler) @@ -919,25 +960,34 @@ def process_dirty_dataframes( logger.info(":: Encoding DataFrame might take a few minutes ------") + if 'cudf' in str(getmodule(ndf)): + import cudf + assert isinstance(ndf, cudf.DataFrame) + logger.debug('Coercing cudf to pandas for skrub, with feature_engine=%s', feature_engine) + ndf_passthrough = ndf.to_pandas() + coercing_to_pandas = True + else: + ndf_passthrough = ndf + coercing_to_pandas = False + try: - X_enc = data_encoder.fit_transform(ndf, y) + X_enc = data_encoder.fit_transform(ndf_passthrough, y) except TypeError: - nndf = ndf.copy() + nndf = ndf_passthrough.copy() object_columns = nndf.select_dtypes(include=['object']).columns nndf[object_columns] = nndf[object_columns].astype(str) X_enc = data_encoder.fit_transform(nndf, y) logger.info("obj columns: %s are being converted to str", object_columns) X_enc = make_array(X_enc) - import warnings - - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=DeprecationWarning) - warnings.filterwarnings("ignore", category=FutureWarning) - features_transformed = data_encoder.get_feature_names_out() + #import warnings + #with warnings.catch_warnings(): + # warnings.filterwarnings("ignore", category=DeprecationWarning) + # warnings.filterwarnings("ignore", category=FutureWarning) + features_transformed = data_encoder.get_feature_names_out() - all_transformers = data_encoder.transformers - logger.debug(f"-Shape of [[dirty_cat fit]] data {X_enc.shape}") + all_transformers = data_encoder.transformers_ + logger.debug(f"-Shape of [[skrub fit]] data {X_enc.shape}") logger.debug(f"-Transformers: \n{all_transformers}\n") logger.debug( f"-Transformed Columns: \n{features_transformed[:20]}...\n" @@ -950,12 +1000,17 @@ def process_dirty_dataframes( data_encoder.get_feature_names_out = callThrough(features_transformed) X_enc = pd.DataFrame( - X_enc, columns=features_transformed, index=ndf.index + X_enc, columns=features_transformed, index=ndf_passthrough.index ) X_enc = X_enc.fillna(0.0) - elif not all_numeric and (not has_dirty_cat or feature_engine in ["pandas", "none"]): + + if coercing_to_pandas: + import cudf + X_enc = cudf.DataFrame.from_pandas(X_enc) + + elif not 
all_numeric and (not has_skrub or feature_engine in ["pandas", "none"]): numeric_ndf = ndf.select_dtypes(include=[np.number]) # type: ignore - logger.warning("-*-*- DataFrame is not numeric and no dirty_cat, dropping non-numeric") + logger.warning("-*-*- DataFrame is not numeric and no skrub, dropping non-numeric") X_enc, _, data_encoder, _ = get_numeric_transformers(numeric_ndf, None) else: logger.debug("-*-*- DataFrame is completely numeric") @@ -968,15 +1023,14 @@ def process_dirty_dataframes( y is not None and len(y.columns) > 0 # noqa: E126,W503 and not is_dataframe_all_numeric(y) # noqa: E126,W503 - and has_dirty_cat # noqa: E126,W503 + and has_skrub # noqa: E126,W503 ): t2 = time() logger.debug("-Fitting Targets --\n%s", y.columns) - label_encoder = SuperVectorizer( - auto_cast=True, + label_encoder = TableVectorizer( cardinality_threshold=cardinality_threshold_target, - high_card_cat_transformer=GapEncoder(n_topics_target) + high_cardinality=GapEncoder(n_topics_target) if not similarity else SimilarityEncoder( similarity=similarity, categories=categories, n_prototypes=2 @@ -986,17 +1040,16 @@ def process_dirty_dataframes( y_enc = label_encoder.fit_transform(y) y_enc = make_array(y_enc) - import warnings - - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=DeprecationWarning) - warnings.filterwarnings("ignore", category=FutureWarning) - if isinstance(label_encoder, SuperVectorizer) or isinstance( - label_encoder, FunctionTransformer - ): - labels_transformed = label_encoder.get_feature_names_out() - else: # Similarity Encoding uses categories_ - labels_transformed = label_encoder.categories_ + #import warnings + #with warnings.catch_warnings(): + # warnings.filterwarnings("ignore", category=DeprecationWarning) + # warnings.filterwarnings("ignore", category=FutureWarning) + if isinstance(label_encoder, TableVectorizer) or isinstance( + label_encoder, FunctionTransformer + ): + labels_transformed = label_encoder.get_feature_names_out() + else: # Similarity Encoding uses categories_ + labels_transformed = label_encoder.categories_ y_enc = pd.DataFrame(y_enc, columns=labels_transformed, @@ -1009,16 +1062,16 @@ def process_dirty_dataframes( # logger.debug(f"-Target Transformers used: # {label_encoder.transformers}\n") logger.debug( - "--Fitting SuperVectorizer on TARGET took" + "--Fitting TableVectorizer on TARGET took" f" {(time() - t2) / 60:.2f} minutes\n" ) elif ( y is not None and len(y.columns) > 0 # noqa: E126,W503 and not is_dataframe_all_numeric(y) # noqa: E126,W503 - and not has_dirty_cat # noqa: E126,W503 + and not has_skrub # noqa: E126,W503 ): - logger.warning("-*-*- y is not numeric and no dirty_cat, dropping non-numeric") + logger.warning("-*-*- y is not numeric and no skrub, dropping non-numeric") y2 = y.select_dtypes(include=[np.number]) # type: ignore y_enc, _, _, label_encoder = get_numeric_transformers(y2, None) else: @@ -1061,14 +1114,14 @@ def process_nodes_dataframes( Any, pd.DataFrame, Any, - SuperVectorizer, - SuperVectorizer, + TableVectorizer, + TableVectorizer, Optional[Pipeline], Optional[Pipeline], Any, List[str], ]: - """Automatic Deep Learning Embedding/ngrams of Textual Features, with the rest of the columns taken care of by dirty_cat + """Automatic Deep Learning Embedding/ngrams of Textual Features, with the rest of the columns taken care of by skrub :param df: pandas DataFrame of data :param y: pandas DataFrame of targets @@ -1177,7 +1230,7 @@ def process_nodes_dataframes( if not text_enc.empty and not X_enc.empty: 
logger.info("-" * 60) - logger.info("<= Found both a textual embedding + dirty_cat =>") + logger.info("<= Found both a textual embedding + skrub =>") X_enc = pd.concat( [text_enc, X_enc], axis=1 ) # np.c_[embeddings, X_enc.values] @@ -1286,6 +1339,17 @@ def encode_edges(edf, src, dst, mlb, fit=False): """ # uses mlb with fit=T/F so we can use it in transform mode # to recreate edge feature concat definition + if not isinstance(edf, pd.DataFrame): + if 'cudf' in str(getmodule(edf)): + import cudf + if isinstance(edf, cudf.DataFrame): + warnings.warn("edf is not a pandas DataFrame, converting") + edf = edf.to_pandas() + else: + raise ValueError(f'Unexpected type for edf: {type(edf)}') + else: + raise ValueError(f'Unexpected type for edf: {type(edf)}') + source = edf[src] destination = edf[dst] logger.debug("Encoding Edges using MultiLabelBinarizer") @@ -1459,8 +1523,16 @@ def process_edge_dataframes( if not X_enc.empty and not T.empty: logger.debug("-" * 60) - logger.debug("<= Found Edges and Dirty_cat encoding =>") - X_enc = pd.concat([T, X_enc], axis=1) + logger.debug("<= Found Edges and skrub encoding =>") + if isinstance(X_enc, pd.DataFrame): + X_enc = pd.concat([T, X_enc], axis=1) + else: + import cudf + if not isinstance(T, cudf.DataFrame): + T2 = cudf.DataFrame(T) + else: + T2 = T + X_enc = cudf.concat([T2, X_enc], axis=1) elif not T.empty and X_enc.empty: logger.debug("-" * 60) logger.debug("<= Found only Edges =>") @@ -1545,12 +1617,13 @@ def transform_text( def transform_dirty( df: pd.DataFrame, - data_encoder: Union[SuperVectorizer, FunctionTransformer], # type: ignore + data_encoder: Union[TableVectorizer, FunctionTransformer], # type: ignore name: str = "", ) -> pd.DataFrame: # from sklearn.preprocessing import MultiLabelBinarizer - logger.debug(f"-{name} Encoder:") - logger.debug(f"\t{data_encoder}\n") + #logger.debug(f"\n{'~' * 80}\n-{name} Encoder:") + #logger.debug(f"{data_encoder}\n") + #logger.debug('\n' + '=' * 80) # print(f"-{name} Encoder:") # print(f"\t{data_encoder}\n") # try: @@ -1558,35 +1631,101 @@ def transform_dirty( # except Exception as e: # logger.warning(e) # pass - logger.debug(f"TRANSFORM pre as df -- \t{df.shape}") + #logger.debug(f"TRANSFORM pre as df -- \t{df.dtypes}|{df.shape}") # ##################################### for dirty_cat 0.3.0 use_columns = getattr(data_encoder, 'columns_', []) if len(use_columns): - #print(f"Using columns: {use_columns}") + #logger.debug(f"Using columns: {use_columns}") X = data_encoder.transform(df[df.columns.intersection(use_columns)]) # ##################################### with dirty_cat 0.2.0 else: + logger.debug(f"Using all columns: {df.columns}") + logger.debug('data_encoder: %s', data_encoder) + feature_names_in = None + + try: + from skrub import TableVectorizer + if isinstance(data_encoder, TableVectorizer): + feature_names_in = data_encoder.feature_names_in_ + except ImportError: + pass + except ModuleNotFoundError: + pass + + try: + from sklearn.preprocessing import FunctionTransformer + if isinstance(data_encoder, FunctionTransformer): + if hasattr(data_encoder, 'get_feature_names_in_'): + #sklearn 1.x+ + feature_names_in = data_encoder.feature_names_in_.tolist() + except ImportError: + pass + except ModuleNotFoundError: + pass + + if feature_names_in is not None: + missing_cols = set(feature_names_in).difference(set(df.columns)) + if missing_cols: + raise ValueError(f'All fit X df columns must appear in df columns to transform, missing {missing_cols}, received {df.columns}, expected all of {feature_names_in}') 
+ excess_cols = set(df.columns).difference(set(feature_names_in)) + if excess_cols: + warnings.warn(f'Dropping extra columns in df: {excess_cols}, expected only {feature_names_in}') + df = df[feature_names_in] + if list(df.columns) != list(feature_names_in): + df = df[feature_names_in] # sort + X = data_encoder.transform(df) # ################################### # X = data_encoder.transform(df) - logger.debug(f"TRANSFORM DIRTY as Matrix -- \t{X.shape}") + logger.debug(f"TRANSFORM DIRTY as Matrix -- \t{X.dtypes}|{X.shape}") X = make_array(X) - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=DeprecationWarning) - warnings.filterwarnings("ignore", category=FutureWarning) - warnings.filterwarnings("ignore", category=UserWarning) - X = pd.DataFrame( - X, columns=data_encoder.get_feature_names_out(), index=df.index - ) - logger.debug(f"TRANSFORM DIRTY dataframe -- \t{X.shape}") + X = pd.DataFrame( + X, columns=data_encoder.get_feature_names_out(), index=df.index + ) + logger.debug(f"TRANSFORM DIRTY dataframe -- \t{X.dtypes}|{X.shape}") return X +# TODO make a similar variant that coerces to fit() schema: subsetting & sorting +def normalize_X_y( + X: pd.DataFrame, + y: pd.DataFrame, + feature_names_in: Optional[pd.Index] = None, + target_names_in: Optional[pd.Index] = None, +) -> Tuple[pd.DataFrame, pd.DataFrame]: + """ + Prepare for most finicky featurizers: drop duplicates, and remove targets from data + + Warns on violations it fixes + """ + + X = drop_duplicates_with_warning(X) + if feature_names_in is not None: + extra_cols = set(X.columns).difference(set(feature_names_in)) + if extra_cols: + warnings.warn(f'Dropping extra columns in X: {extra_cols}, expected only {feature_names_in}') + X = X[[c for c in feature_names_in]] + + if len(y.columns) > 0: + y = drop_duplicates_with_warning(y) + if target_names_in is not None: + extra_cols = set(y.columns).difference(set(target_names_in)) + if extra_cols: + warnings.warn(f'Dropping extra columns in y: {extra_cols}, expected only {target_names_in}') + y = y[[c for c in target_names_in]] + + X = features_without_target(X, y) + + return X, y + + def transform( - df: pd.DataFrame, ydf: pd.DataFrame, res: List, kind: str, src, dst + df: pd.DataFrame, ydf: Optional[pd.DataFrame], + res: List, kind: str, src, dst, + feature_names_in: pd.Index, target_names_in: pd.Index ) -> Tuple[pd.DataFrame, pd.DataFrame]: # here res is the featurization result, # this function aligns with what is computed during @@ -1604,8 +1743,39 @@ def transform( text_cols, ) = res - logger.info("-" * 90) + #if not hasattr(self, '_feature_params'): + # raise ValueError('Must first run `g.umap()` or `g.featurize()` before transforming data') + #if kind not in self._feature_params: + # raise ValueError(f'Must first run `g.umap(kind="{kind}")` or `g.featurize(kind="{kind}")` before transforming data') + #trained_params = self._feature_params[kind] + + + prev_def = df + if len(df.columns) == 0: + raise ValueError('df must have columns to transform data') + if ydf is None: + ydf = pd.DataFrame([]) + + df, ydf = normalize_X_y(df, ydf, feature_names_in, target_names_in) + + if len(df.columns) == 0: + raise ValueError(f'df must have columns to transform data, received X={prev_def.columns}, y={ydf.columns}, returned {df.columns}') + + assert df is not None, 'df must be provided to transform data' + X_df_intersection = df.columns.intersection(feature_names_in) + missing_cols = feature_names_in.difference(X_df_intersection) + assert set(X_df_intersection) ==
set(feature_names_in), f'All fit X df columns must appear in df columns to transform, missing {missing_cols}, received {df.columns}, expected all of {feature_names_in}' + if list(df.columns) != list(feature_names_in): + df = df[feature_names_in] # sort + + if len(ydf.columns) > 0: + y_df_intersection = ydf.columns.intersection(target_names_in) + missing_cols = target_names_in.difference(y_df_intersection) + assert set(y_df_intersection) == set(target_names_in), f'All fit y df columns must appear in new transformed y df columns, missing {missing_cols}, received {ydf.columns}' + if list(ydf.columns) != list(target_names_in): + ydf = ydf[target_names_in] # sort + # index = df.index y = pd.DataFrame([]) T = pd.DataFrame([]) @@ -1624,7 +1794,7 @@ def transform( data_encoder, "Numeric or Dirty Edge-Features") - if ydf is not None: + if len(ydf.columns) > 0: logger.info("-Transforming Target--") y = transform_dirty(ydf, label_encoder, name=f"{kind.title()}-Label") @@ -1638,15 +1808,15 @@ def transform( tX = transform_text(res_df, text_model, text_cols) logger.info("** text features are empty") if tX.empty else None - # concat text to dirty_cat, with text in front. + # concat text to skrub, with text in front. if not tX.empty and not X.empty: X = pd.concat([tX, X], axis=1) - logger.info("--Combining both Textual and Numeric/Dirty_Cat") + logger.info("--Combining both Textual and Numeric/skrub") elif not tX.empty and X.empty: X = tX # textual logger.info("--Just textual") elif not X.empty: - logger.info("--Just Numeric/Dirty_Cat transformer") + logger.info("--Just Numeric/skrub transformer") X = X # dirty/Numeric else: logger.info("-" * 60) @@ -1655,7 +1825,7 @@ def transform( # now if edges, add T at front if kind == "edges": - X = pd.concat([T, X], axis=1) # edges, text, dirty_cat + X = pd.concat([T, X], axis=1) # edges, text, skrub logger.info("-Combining MultiLabelBinarizer with previous features") logger.info("-" * 40) @@ -1675,10 +1845,11 @@ def transform( class FastEncoder: - def __init__(self, df, y=None, kind="nodes"): + def __init__(self, df: pd.DataFrame, y: pd.DataFrame, kind="nodes"): + df, y = normalize_X_y(df, y) self._df = df self.feature_names_in = df.columns - self._y = pd.DataFrame([], index=df.index) if y is None else y + self._y = y self.target_names_in = self._y.columns self.kind = kind self._assertions() @@ -1733,15 +1904,15 @@ def _set_result(self, res): text_cols, ] = self.res - self._hecho(res) + # self._hecho(res) # data_encoder.feature_names_in = self.feature_names_in # label_encoder.target_names_in = self.target_names_in self.feature_columns = X_enc.columns self.feature_columns_target = y_enc.columns self.X = X_encs self.y = y_encs - self.X_orignal = X_enc - self.y_orignal = y_enc + self.X_original = X_enc # resolved at fit + self.y_original = y_enc # resolved at fit self.data_encoder = data_encoder # is list for edges self.label_encoder = label_encoder self.scaling_pipeline = scaling_pipeline @@ -1757,14 +1928,14 @@ def fit(self, src=None, dst=None, *args, **kwargs): ) self._set_result(res) - def transform(self, df, ydf=None): + def transform(self, df: pd.DataFrame, ydf: Optional[pd.DataFrame] = None): "Raw transform, no scaling." 
- X, y = transform(df, ydf, self.res, self.kind, self.src, self.dst) + X, y = transform(df, ydf, self.res, self.kind, self.src, self.dst, self.feature_names_in, self.target_names_in) return X, y - def _transform_scaled(self, df, ydf, scaling_pipeline, scaling_pipeline_target): + def _transform_scaled(self, df: pd.DataFrame, ydf: Optional[pd.DataFrame], scaling_pipeline, scaling_pipeline_target): """Transform with scaling fit during fit.""" - X, y = transform(df, ydf, self.res, self.kind, self.src, self.dst) + X, y = transform(df, ydf, self.res, self.kind, self.src, self.dst, self._df.columns, self._y.columns) if scaling_pipeline is not None and not X.empty: X = pd.DataFrame(scaling_pipeline.transform(X), columns=X.columns, index=X.index) if scaling_pipeline_target is not None and y is not None and not y.empty: @@ -1956,7 +2127,7 @@ def _featurize_nodes( X_resolved = resolve_X(ndf, X) y_resolved = resolve_y(ndf, y) - from .features import ModelDict + from graphistry.models.ModelDict import ModelDict fkwargs = ModelDict("Featurize Params", X=X_resolved, @@ -2004,8 +2175,7 @@ def _featurize_nodes( return fresh_res - X_resolved = features_without_target(X_resolved, y_resolved) - X_resolved = remove_internal_namespace_if_present(X_resolved) + X_resolved, y_resolved = normalize_X_y(X_resolved, y_resolved) keys_to_remove = ["X", "y", "remove_node_column"] nfkwargs = dict() @@ -2021,9 +2191,9 @@ def _featurize_nodes( # if changing, also update fresh_res res._node_features = encoder.X - res._node_features_raw = encoder.X_orignal # .copy() + res._node_features_raw = encoder.X_original # .copy() res._node_target = encoder.y - res._node_target_raw = encoder.y_orignal # .copy() + res._node_target_raw = encoder.y_original # .copy() res._node_encoder = encoder # now this does # all the work `._node_encoder.transform(df, y)` etc @@ -2077,6 +2247,8 @@ def _featurize_edges( **{res._destination: res._edges[res._destination]} ) + X_resolved, y_resolved = normalize_X_y(X_resolved, y_resolved) + # now that everything is set fkwargs = dict( X=X_resolved, @@ -2121,8 +2293,6 @@ def _featurize_edges( return fresh_res - X_resolved = features_without_target(X_resolved, y_resolved) - keys_to_remove = [ "X", "y", @@ -2140,9 +2310,9 @@ def _featurize_edges( # if editing, should also update fresh_res res._edge_features = encoder.X - res._edge_features_raw = encoder.X_orignal # .copy() + res._edge_features_raw = encoder.X_original # .copy() res._edge_target = encoder.y - res._edge_target_raw = encoder.y_orignal # .copy() + res._edge_target_raw = encoder.y_original # .copy() res._edge_encoder = encoder return res @@ -2405,7 +2575,7 @@ def featurize( https://scikit-learn.org/stable/modules/preprocessing.html Here 'standard' corresponds to 'StandardScaler' in scikits. :param use_scaler_target: selects which scaler to scale the target - :param cardinality_threshold: dirty_cat threshold on cardinality of + :param cardinality_threshold: skrub threshold on cardinality of categorical labels across columns. If value is greater than threshold, will run GapEncoder (a topic model) on column.
@@ -2815,7 +2985,7 @@ def featurize_or_get_edges_dataframe_if_X_is_None( ) - def get_matrix(self, columns: Optional[Union[List, str]] = None, kind: str = 'nodes', target: bool = False) -> pd.DataFrame: + def get_matrix(self, columns: Optional[Union[List, str]] = None, kind: GraphEntityKind = 'nodes', target: bool = False) -> pd.DataFrame: """Returns feature matrix, and if columns are specified, returns matrix with only the columns that contain the string `column_part` in their name. `X = g.get_matrix(['feature1', 'feature2'])` will retrieve a feature matrix with only the columns that contain the string `feature1` or `feature2` in their name. Most useful for topic modeling, where the column names are of the form `topic_0: descriptor`, `topic_1: descriptor`, etc. Can retrieve unique columns in original dataframe, or actual topic features like [ip_part, shoes, preference_x, etc]. Powerful way to retrieve features from a featurized graph by column or (top) features of interest. **Example:** diff --git a/graphistry/models/ModelDict.py b/graphistry/models/ModelDict.py new file mode 100644 index 0000000000..74128c3d76 --- /dev/null +++ b/graphistry/models/ModelDict.py @@ -0,0 +1,51 @@ + +from collections import UserDict +import logging + + +logger = logging.getLogger(__name__) + + +def get_timestamp() -> str: + import datetime + + return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + +class ModelDict(UserDict): + """Helper class to print out model names and keep track of updates + + Args: + message: description of model + verbose: print out model names, logging happens regardless + """ + + def __init__(self, message, verbose=True, _timestamp=False, *args, **kwargs): + self._message = message + self._verbose = verbose + self._timestamp = _timestamp # do not use this inside the class, as it will trigger memoization. Only use outside of class.
+ L = ( + len(message) + if _timestamp is False + else max(len(message), len(get_timestamp()) + 1) + ) + self._print_length = min(80, L) + self._updates = [] + super().__init__(*args, **kwargs) + + def print(self, message): + if self._timestamp: + message = f"{message}\n{get_timestamp()}" + if self._verbose: + logger.debug('ModelDict: %s', message) + + def __repr__(self): + return super().__repr__() + + def update(self, *args, **kwargs): + self._updates.append(args[0]) + if len(self._updates) > 1: # don't take first update since its the init/default + self._message += ( + "\n" + "_" * self._print_length + f"\n\nUpdated: {self._updates[-1]}" + ) + return super().update(*args, **kwargs) diff --git a/graphistry/models/compute/dbscan.py b/graphistry/models/compute/dbscan.py new file mode 100644 index 0000000000..47fee34329 --- /dev/null +++ b/graphistry/models/compute/dbscan.py @@ -0,0 +1,9 @@ +from typing import Set +from typing_extensions import Literal + + +DBSCANEngine = Literal["cuml", "sklearn"] +DBSCANEngineAbstract = Literal[DBSCANEngine, "auto"] + +dbscan_engine_values: Set[DBSCANEngine] = {"cuml", "sklearn"} +dbscan_engine_abstract_values: Set[DBSCANEngineAbstract] = {"cuml", "sklearn", "auto"} diff --git a/graphistry/models/compute/features.py b/graphistry/models/compute/features.py new file mode 100644 index 0000000000..9c2ea91924 --- /dev/null +++ b/graphistry/models/compute/features.py @@ -0,0 +1,9 @@ +from typing import Set +from typing_extensions import Literal + +GraphEntityKind = Literal['nodes', 'edges'] +graph_entity_kind_values: Set[GraphEntityKind] = {'nodes', 'edges'} + +FeatureEngineConcrete = Literal["none", "pandas", "skrub", "torch"] +FeatureEngine = Literal[FeatureEngineConcrete, "dirty_cat", "auto"] +feature_engine_concrete_values: Set[FeatureEngineConcrete] = {"none", "pandas", "skrub", "torch"} diff --git a/graphistry/models/compute/umap.py b/graphistry/models/compute/umap.py new file mode 100644 index 0000000000..779e26db35 --- /dev/null +++ b/graphistry/models/compute/umap.py @@ -0,0 +1,7 @@ +from typing import Set +from typing_extensions import Literal + +UMAPEngineConcrete = Literal['cuml', 'umap_learn'] +UMAPEngine = Literal[UMAPEngineConcrete, "auto"] + +umap_engine_values: Set[UMAPEngineConcrete] = {'cuml', 'umap_learn'} diff --git a/graphistry/outliers.py b/graphistry/outliers.py index fa17377169..283cd7848d 100644 --- a/graphistry/outliers.py +++ b/graphistry/outliers.py @@ -1,4 +1,4 @@ -from typing import Union, Tuple +from typing import Dict, Union, Tuple import pandas as pd import logging @@ -94,7 +94,7 @@ def plot_outliers( fig (matplotlib.figure.Figure): figure of the plot and axes """ colors = ["m", "r", "b", "g"] - legend1 = {} + legend1: Dict[str, plt.QuadContourSet] = {} if xy_extent is None: mminx, mmaxx = embedding.T[0].min(), embedding.T[0].max() mminy, mmaxy = embedding.T[1].min(), embedding.T[1].max() @@ -145,10 +145,10 @@ def plot_outliers( plt.ylim((yy.min(), yy.max())) plt.legend( ( - legend1_values_list[0].collections[0], - legend1_values_list[1].collections[0], - legend1_values_list[2].collections[0], - legend1_values_list[3].collections[0], + legend1_values_list[0], + legend1_values_list[1], + legend1_values_list[2], + legend1_values_list[3], ), ( legend1_keys_list[0], diff --git a/graphistry/plugins/cugraph.py b/graphistry/plugins/cugraph.py index 2dd048faa2..d922cec06b 100644 --- a/graphistry/plugins/cugraph.py +++ b/graphistry/plugins/cugraph.py @@ -1,5 +1,7 @@ -import pandas as pd from typing import Any, Dict, List, Optional, 
Union +import pandas as pd +import warnings + from graphistry.constants import NODE from graphistry.Engine import EngineAbstract from graphistry.Plottable import Plottable @@ -8,7 +10,6 @@ logger = setup_logger(__name__) - #import logging #logger.setLevel(logging.DEBUG) @@ -38,7 +39,7 @@ def from_cugraph(self, ) -> Plottable: """ - If bound IDs, use the same IDs in the returned graph. + Take an input cugraph.Graph object and load its data and bindings (source, destination, edge_weight) If non-empty nodes/edges, instead of returning G's topology, use existing topology and merge in G's attributes @@ -50,8 +51,28 @@ def from_cugraph(self, #### - src = self._source or SRC_CUGRAPH - dst = self._destination or DST_CUGRAPH + if hasattr(G, 'source_columns') and G.source_columns is not None: + s = G.source_columns + if isinstance(s, list): + s = s[0] + assert isinstance(s, str), "Found G.source_columns, and expected it to be a string or a list of one string, but was: %s" % G.source_columns + if self._source is not None and self._source != s: + warnings.warn('Switching g source column name to G source column name') + else: + s = self._source or SRC_CUGRAPH + src = s + + if hasattr(G, 'destination_columns') and G.destination_columns is not None: + d = G.destination_columns + if isinstance(d, list): + d = d[0] + assert isinstance(d, str), "Found G.destination_columns, and expected it to be a string or a list of one string, but was: %s" % G.destination_columns + if self._destination is not None and self._destination != d: + warnings.warn('Switching g destination column name to G destination column name') + else: + d = self._destination or DST_CUGRAPH + dst = d + edges_gdf = G.view_edge_list() # src, dst if g._nodes is not None and load_nodes: @@ -326,7 +347,15 @@ def compute_cugraph_core( out = out[0] if out_col is not None: raise ValueError('Graph returned, but out_col was specified') - return from_cugraph(self, out, load_nodes=False) + self2 = self + if self._source != out.source_columns: + logger.debug('Switching g source column name to G source column name to work around cugraph inconsistency') + if out.source_columns == 'src': + self2 = self.edges(self._edges.rename(columns={self._source: 'src', self._destination: 'dst'}), 'src', 'dst') + res = from_cugraph(self2, out, load_nodes=False) + if not (self2 is self): + res = res.edges(self._edges, self._source, self._destination) + return res raise ValueError('Unsupported algorithm: %s', alg) diff --git a/graphistry/tests/layout/ring/test_ring_categorical.py b/graphistry/tests/layout/ring/test_ring_categorical.py index 3b4ad40c80..2d4f055c68 100644 --- a/graphistry/tests/layout/ring/test_ring_categorical.py +++ b/graphistry/tests/layout/ring/test_ring_categorical.py @@ -131,7 +131,7 @@ def test_ring_cudf(self): rs = (g._nodes['x'] * g._nodes['x'] + g._nodes['y'] * g._nodes['y']).apply(np.sqrt) assert rs.min() == 500 assert rs.max() == 800 - assert len(g._complex_encodings and g._complex_encodings['node_encodings']['default']['pointAxisEncoding']['rows']) == 5 + assert len(g._complex_encodings and g._complex_encodings['node_encodings']['default']['pointAxisEncoding']['rows']) == 4 for i, row in enumerate(g._complex_encodings['node_encodings']['default']['pointAxisEncoding']['rows']): assert row['r'] == 500 + 100 * i - assert row['label'] == str(2 + 2 * i) + assert row['label'] == ['a', 'bb', 'cc', 'dd'][i] diff --git a/graphistry/tests/plugins/test_cugraph.py b/graphistry/tests/plugins/test_cugraph.py index 7f22354659..883ac452e0 100644 ---
a/graphistry/tests/plugins/test_cugraph.py +++ b/graphistry/tests/plugins/test_cugraph.py @@ -69,8 +69,8 @@ def test_minimal_edges(self): g = graphistry.from_cugraph(G, load_nodes=False) assert g._nodes is None and g._node is None assert g._source is not None and g._destination is not None - assert g._source == SRC_CUGRAPH - assert g._destination == DST_CUGRAPH + assert g._source == 'a' + assert g._destination == 'b' assert g._edges is not None assert isinstance(g._edges, cudf.DataFrame) assert len(g._edges) == len(edges) @@ -88,14 +88,14 @@ def test_minimal_attributed_edges(self): assert g._nodes is None and g._node is None assert len(g._edges) == len(edges) assert g._source is not None and g._destination is not None - assert g._source == SRC_CUGRAPH - assert g._destination == DST_CUGRAPH + assert g._source == 'a' + assert g._destination == 'b' assert g._edges is not None assert isinstance(g._edges, cudf.DataFrame) assert len(g._edges) == len(edges) assert len(g._edges[g._source].dropna()) == len(edges) assert len(g._edges[g._destination].dropna()) == len(edges) - assert (g._edges['weights'].to_pandas() == edges_w['w']).all() + assert (g._edges['w'].to_pandas() == edges_w['w']).all() def test_merge_existing_edges_pandas(self): @@ -191,8 +191,8 @@ def test_minimal_edges(self): logger.debug('G: %s', G) g2 = graphistry.from_cugraph(G) assert g2._edges.shape == g._edges.shape - assert g2._source == SRC_CUGRAPH - assert g2._destination == DST_CUGRAPH + assert g2._source == g._source + assert g2._destination == g._destination assert g2._edge is None assert g2._nodes is None and g2._node is None #logger.debug('g2._nodes: %s', g2._nodes) @@ -249,8 +249,8 @@ def test_minimal_edges_str(self): logger.debug('G: %s', G) g2 = graphistry.from_cugraph(G) assert g2._edges.shape == g._edges.shape - assert g2._source == SRC_CUGRAPH - assert g2._destination == DST_CUGRAPH + assert g2._source == g._source + assert g2._destination == g._destination assert g2._edge is None assert ( g2._edges @@ -283,8 +283,8 @@ def test_nodes(self): logger.debug('ig: %s', G) g2 = graphistry.from_cugraph(G).materialize_nodes() assert g2._edges.shape == g._edges.shape - assert g2._source == SRC_CUGRAPH - assert g2._destination == DST_CUGRAPH + assert g2._source == g._source + assert g2._destination == g._destination assert g2._edge is None assert g2._node == 'id' logger.debug('g2._nodes: %s', g2._nodes) @@ -336,8 +336,8 @@ def test_drop_nodes(self): logger.debug('G: %s', G) g2 = graphistry.from_cugraph(G).materialize_nodes() assert g2._edges.shape == g._edges.shape - assert g2._source == SRC_CUGRAPH - assert g2._destination == DST_CUGRAPH + assert g2._source == g._source + assert g2._destination == g._destination assert g2._edge is None logger.debug('g2._nodes: %s', g2._nodes) logger.debug('other: %s', nodes) @@ -604,6 +604,8 @@ def test_all_calls(self): edges3_gdf = cudf.from_pandas(edges3_df) g = graphistry.edges(edges3_gdf, 'a', 'b').bind(edge_weight='f').materialize_nodes() + assert g._source == 'a' + assert g._destination == 'b' for alg in [x for x in compute_algs]: if alg not in skiplist: opts = overrides[alg] if alg in overrides else {} diff --git a/graphistry/tests/test_bolt_util.py b/graphistry/tests/test_bolt_util.py index c79c625955..a9cd3493c6 100644 --- a/graphistry/tests/test_bolt_util.py +++ b/graphistry/tests/test_bolt_util.py @@ -1,6 +1,12 @@ # -*- coding: utf-8 -*- -import datetime as dt, graphistry, neo4j, os, numpy as np, pandas as pd, pyarrow as pa, pytest +import datetime as dt, graphistry, os, numpy as np, 
pandas as pd, pyarrow as pa, pytest + +try: + import neo4j + has_neo4j = True +except (ImportError, ModuleNotFoundError): + has_neo4j = False from graphistry.bolt_util import ( neo_df_to_pd_df, @@ -13,6 +19,7 @@ ) +@pytest.mark.skipif(not has_neo4j, reason="No neo4j") def test_neo_df_to_pd_df_basics(): rec = {"x": 1, "b": True, "s": "abc", "a": [1, 2, 3], "d": {"r": "v"}, "mt": None} df = pd.DataFrame([rec]) @@ -30,6 +37,7 @@ def test_neo_df_to_pd_df_basics(): pa.Table.from_pandas(df2) +@pytest.mark.skipif(not has_neo4j, reason="No neo4j") def test_neo_df_to_pd_df_basics_na(): recs = { "x": [1, None], @@ -54,6 +62,7 @@ def test_neo_df_to_pd_df_basics_na(): pa.Table.from_pandas(df2) +@pytest.mark.skipif(not has_neo4j, reason="No neo4j") def test_dates_homogeneous(): rec = { "d": neo4j.time.Date(2020, 10, 20), @@ -79,6 +88,7 @@ def test_dates_homogeneous(): pa.Table.from_pandas(df2) +@pytest.mark.skipif(not has_neo4j, reason="No neo4j") def test_dates_homogeneous_na(): recs = { "d": [neo4j.time.Date(2020, 10, 20), None], @@ -110,6 +120,7 @@ def test_dates_homogeneous_na(): pa.Table.from_pandas(df2) +@pytest.mark.skipif(not has_neo4j, reason="No neo4j") def test_dates_heterogeneous(): recs = { "d": [neo4j.time.Date(2020, 10, 20), 1], @@ -143,6 +154,7 @@ def test_dates_heterogeneous(): pa.Table.from_pandas(df2) +@pytest.mark.skipif(not has_neo4j, reason="No neo4j") def test_spatial_homogenous(): rec = { "p": neo4j.spatial.Point([1, 2, 3]), @@ -196,6 +208,7 @@ def test_spatial_homogenous(): pa.Table.from_pandas(df2) +@pytest.mark.skipif(not has_neo4j, reason="No neo4j") def test_spatial_homogenous_na(): recs = { "p": [neo4j.spatial.Point([1, 2, 3, 4]), None], @@ -252,6 +265,7 @@ def test_spatial_homogenous_na(): pa.Table.from_pandas(df2) +@pytest.mark.skipif(not has_neo4j, reason="No neo4j") def test_spatial_heterogeneous(): recs = { "p": [neo4j.spatial.Point([1, 2, 3, 4]), 1], @@ -284,6 +298,7 @@ def test_spatial_heterogeneous(): pa.Table.from_pandas(df2) +@pytest.mark.skipif(not has_neo4j, reason="No neo4j") @pytest.mark.skipif( not ("WITH_NEO4J" in os.environ) or os.environ["WITH_NEO4J"] != "1", reason="No WITH_NEO4J=1", @@ -303,11 +318,6 @@ def test_neo4j_conn_setup(self): @pytest.mark.xfail(reason='Waiting on https://github.com/neo4j/neo4j-python-driver/pull/789') def test_neo4j_ready(self): - #To fully suppress, nest under; - #import warnings - #with warnings.catch_warnings(): - # warnings.filterwarnings("ignore", category=DeprecationWarning) - g = graphistry.cypher("MATCH (a)-[b]-(c) WHERE a <> c RETURN a, b, c LIMIT 1") assert len(g._nodes) == 2 diff --git a/graphistry/tests/test_compute_cluster.py b/graphistry/tests/test_compute_cluster.py index b9bcc77844..12f90cd700 100644 --- a/graphistry/tests/test_compute_cluster.py +++ b/graphistry/tests/test_compute_cluster.py @@ -2,8 +2,9 @@ import unittest import pytest import graphistry +from graphistry.Plottable import Plottable from graphistry.constants import DBSCAN -from graphistry.util import ModelDict +from graphistry.models.ModelDict import ModelDict from graphistry.utils.lazy_import import ( lazy_dbscan_import, lazy_umap_import @@ -17,13 +18,13 @@ class TestComputeCluster(unittest.TestCase): - def _condition(self, g, kind): + def _condition(self, g: Plottable, kind): if kind == 'nodes': - self.assertTrue(g._node_dbscan is not None, 'instance has no `_node_dbscan` method') + self.assertTrue(g._dbscan_nodes is not None, 'instance has no `_dbscan_nodes` method') self.assertTrue(DBSCAN in g._nodes, 'node df has no `_dbscan` attribute') 
#self.assertTrue(g._point_color is not None, 'instance has no `_point_color` method') else: - self.assertTrue(g._edge_dbscan is not None, 'instance has no `_edge_dbscan` method') + self.assertTrue(g._dbscan_edges is not None, 'instance has no `_dbscan_edges` method') self.assertTrue(DBSCAN in g._edges, 'edge df has no `_dbscan` attribute') @pytest.mark.skipif(not has_dbscan or not has_umap, reason="requires ai dependencies") diff --git a/graphistry/tests/test_feature_utils.py b/graphistry/tests/test_feature_utils.py index fd30c30c8a..f4730603eb 100644 --- a/graphistry/tests/test_feature_utils.py +++ b/graphistry/tests/test_feature_utils.py @@ -8,7 +8,6 @@ import pytest import unittest -import warnings from graphistry.feature_utils import ( process_dirty_dataframes, @@ -31,7 +30,6 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) -warnings.filterwarnings("ignore") logging.getLogger("graphistry.feature_utils").setLevel(logging.DEBUG) model_avg_name = ( @@ -215,7 +213,7 @@ def test_get_col_matrix(self): assert all(self.g3.get_matrix().columns == self.g3._node_features.columns) # assert list(self.g3.get_matrix(['language', 'freedom']).columns) == freedom, self.g3.get_matrix(['language', 'freedom']).columns -class TestFastEncoder(unittest.TestCase): +class TestFastEncoderNode(unittest.TestCase): # we test how far off the fit returned values different from the transformed @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies") @@ -225,6 +223,21 @@ def setUp(self): use_ngrams=True, ngram_range=(1, 1), use_scaler='robust', cardinality_threshold=100) self.X, self.Y = fenc.X, fenc.y self.x, self.y = fenc.transform(ndf_reddit, ydf=double_target_reddit) + + @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies") + def test_allclose_fit_transform_on_same_data_nodes(self): + check_allclose_fit_transform_on_same_data(self.X, self.x, self.Y, self.y) + + @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies") + def test_columns_match(self): + assert set(self.X.columns) == set(self.x.columns), 'Node Feature Columns do not match' + assert set(self.Y.columns) == set(self.y.columns), 'Node Target Columns do not match' + +class TestFastEncoderEdge(unittest.TestCase): + # we test how far off the fit returned values different from the transformed + + @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies") + def setUp(self): fenc = FastEncoder(edge_df2, y=edge2_target_df, kind='edges') fenc.fit(src='src', dst='dst', feature_engine=resolve_feature_engine('auto'), @@ -235,23 +248,20 @@ def setUp(self): self.Xe, self.Ye = fenc.X, fenc.y self.xe, self.ye = fenc.transform(edge_df2, ydf=edge2_target_df) - + @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies") - def test_allclose_fit_transform_on_same_data(self): - check_allclose_fit_transform_on_same_data(self.X, self.x, self.Y, self.y) + def test_allclose_fit_transform_on_same_data_edges(self): check_allclose_fit_transform_on_same_data(self.Xe, self.xe, self.Ye, self.ye) - + @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies") def test_columns_match(self): - assert all(self.X.columns == self.x.columns), 'Node Feature Columns do not match' - assert all(self.Y.columns == 
self.y.columns), 'Node Target Columns do not match' - assert all(self.Xe.columns == self.xe.columns), 'Edge Feature Columns do not match' - assert all(self.Ye.columns == self.ye.columns), 'Edge Target Columns do not match' - - + assert set(self.Xe.columns) == set(self.xe.columns), 'Edge Feature Columns do not match' + assert set(self.Ye.columns) == set(self.ye.columns), 'Edge Target Columns do not match' + + class TestFeatureProcessors(unittest.TestCase): def cases_tests(self, x, y, data_encoder, target_encoder, name, value): - import dirty_cat + import skrub self.assertIsInstance( x, pd.DataFrame, @@ -272,44 +282,40 @@ def cases_tests(self, x, y, data_encoder, target_encoder, name, value): ) self.assertIsInstance( data_encoder, - dirty_cat.super_vectorizer.SuperVectorizer, - f"Data Encoder is not a dirty_cat.super_vectorizer.SuperVectorizer instance for {name} {value}", + skrub.TableVectorizer, + f"Data Encoder is not a skrub.TableVectorizer instance for {name} {value}", ) self.assertIsInstance( target_encoder, - dirty_cat.super_vectorizer.SuperVectorizer, - f"Data Target Encoder is not a dirty_cat.super_vectorizer.SuperVectorizer instance for {name} {value}", + skrub.TableVectorizer, + f"Data Target Encoder is not a skrub.TableVectorizer instance for {name} {value}", ) @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies") def test_process_node_dataframes_min_words(self): # test different target cardinality - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning) - for min_words in [ - 2, - 4000, - ]: # last one should skip encoding, and throw all to dirty_cat - - X_enc, y_enc, X_encs, y_encs, data_encoder, label_encoder, ordinal_pipeline, ordinal_pipeline_target, text_model, text_cols = process_nodes_dataframes( - ndf_reddit, - y=double_target_reddit, - use_scaler='none', - cardinality_threshold=40, - cardinality_threshold_target=40, - n_topics=20, - min_words=min_words, - model_name=model_avg_name, - feature_engine=resolve_feature_engine('auto') - ) - self.cases_tests(X_enc, y_enc, data_encoder, label_encoder, "min_words", min_words) + for min_words in [ + 2, + 4000, + ]: # last one should skip encoding, and throw all to skrub + + X_enc, y_enc, X_encs, y_encs, data_encoder, label_encoder, ordinal_pipeline, ordinal_pipeline_target, text_model, text_cols = process_nodes_dataframes( + ndf_reddit, + y=double_target_reddit, + use_scaler='none', + cardinality_threshold=40, + cardinality_threshold_target=40, + n_topics=20, + min_words=min_words, + model_name=model_avg_name, + feature_engine=resolve_feature_engine('auto') + ) + self.cases_tests(X_enc, y_enc, data_encoder, label_encoder, "min_words", min_words) @pytest.mark.skipif(not has_min_dependancy, reason="requires minimal feature dependencies") def test_multi_label_binarizer(self): g = graphistry.nodes(bad_df) # can take in a list of lists and convert to multiOutput - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning) - g2 = g.featurize(y=['list_str'], X=['src'], multilabel=True) + g2 = g.featurize(y=['list_str'], X=['src'], multilabel=True) y = g2._get_target('node') assert y.shape == (4, 4) assert sum(y.sum(1).values - np.array([1., 2., 1., 0.])) == 0 @@ -359,36 +365,34 @@ def cases_test_graph(self, g, name, value, kind="nodes", df=ndf_reddit): ) def _test_featurizations(self, g, use_cols, targets, name, kind, df): - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", 
category=UserWarning) - for cardinality in [2, 200]: - for use_ngram in [True, False]: - for use_col in use_cols: - for target in targets: - logger.debug("*" * 90) - value = [cardinality, use_ngram, target, use_col] - names = "cardinality, use_ngram, target, use_col".split(', ') - logger.debug(f"{value}") - print(f"{[k for k in zip(names, value)]}") - logger.debug("-" * 80) - if kind == 'edges' and cardinality == 2: - # GapEncoder is set to fail on small documents like our edge_df..., so we skip - continue - g2 = g.featurize( - kind=kind, - X=use_col, - y=target, - model_name=model_avg_name, - use_scaler=None, - use_scaler_target=None, - use_ngrams=use_ngram, - min_df=0.0, - max_df=1.0, - cardinality_threshold=cardinality, - cardinality_threshold_target=cardinality - ) - - self.cases_test_graph(g2, name=name, value=value, kind=kind, df=df) + for cardinality in [2, 200]: + for use_ngram in [True, False]: + for use_col in use_cols: + for target in targets: + logger.debug("*" * 90) + value = [cardinality, use_ngram, target, use_col] + names = "cardinality, use_ngram, target, use_col".split(', ') + logger.debug(f"{value}") + print(f"{[k for k in zip(names, value)]}") + logger.debug("-" * 80) + if kind == 'edges' and cardinality == 2: + # GapEncoder is set to fail on small documents like our edge_df..., so we skip + continue + g2 = g.featurize( + kind=kind, + X=use_col, + y=target, + model_name=model_avg_name, + use_scaler=None, + use_scaler_target=None, + use_ngrams=use_ngram, + min_df=0.0, + max_df=1.0, + cardinality_threshold=cardinality, + cardinality_threshold_target=cardinality + ) + + self.cases_test_graph(g2, name=name, value=value, kind=kind, df=df) @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies") diff --git a/graphistry/tests/test_gremlin.py b/graphistry/tests/test_gremlin.py index 19b42c8c04..af875f916d 100644 --- a/graphistry/tests/test_gremlin.py +++ b/graphistry/tests/test_gremlin.py @@ -1,11 +1,17 @@ # -*- coding: utf-8 -*- from typing import Iterable -import os, numpy as np, pandas as pd, pyarrow as pa, pytest, queue +import numpy as np, pandas as pd, pytest, queue from common import NoAuthTestCase from concurrent.futures import Future from mock import patch -from gremlin_python.driver.resultset import ResultSet -from gremlin_python.structure.graph import Vertex, Edge, Path + +try: + from gremlin_python.driver.resultset import ResultSet + from gremlin_python.structure.graph import Vertex, Edge, Path + has_gremlin = True +except (ImportError, ModuleNotFoundError): + has_gremlin = False + from graphistry.gremlin import ( CosmosMixin, @@ -76,7 +82,7 @@ def make_resultset(items=[]) -> Iterable: # ### Gremlin ### # - +@pytest.mark.skipif(not has_gremlin, reason="gremlin-python not installed") class TestGremlinMixin(NoAuthTestCase): def test_connect_default_off(self): tg = TG() @@ -376,6 +382,7 @@ def test_edge_to_queries_single_typed_inferred_category(self): ] == "g.v('a').addE('x').to(g.v('b')).property('v1', '2')" +@pytest.mark.skipif(not has_gremlin, reason="gremlin-python not installed") class TestCosmosMixin(NoAuthTestCase): def test_cosmos_init(self): cg = CFull( diff --git a/graphistry/tests/test_nodexl.py b/graphistry/tests/test_nodexl.py index 00a71c45c2..72f12bfac1 100644 --- a/graphistry/tests/test_nodexl.py +++ b/graphistry/tests/test_nodexl.py @@ -1,10 +1,19 @@ # -*- coding: utf-8 -*- import pandas as pd +import pytest import graphistry from common import NoAuthTestCase +try: + import openpyxl + 
has_openpyxl = True +except (ImportError, ModuleNotFoundError): + has_openpyxl = False + + +@pytest.mark.skipif(not has_openpyxl, reason="openpyxl not installed") class TestNodexlBindings(NoAuthTestCase): def test_from_xls_default(self): xls = pd.ExcelFile( diff --git a/graphistry/tests/test_text_utils.py b/graphistry/tests/test_text_utils.py index 5b930f553f..6de0eeb830 100644 --- a/graphistry/tests/test_text_utils.py +++ b/graphistry/tests/test_text_utils.py @@ -1,6 +1,5 @@ import pytest import unittest -import warnings import graphistry import logging @@ -21,8 +20,6 @@ logger = logging.getLogger(__name__) -warnings.filterwarnings('ignore') - class TestTextSearch(unittest.TestCase): # check to see that .fit and transform gives similar embeddings on same data @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies") @@ -31,23 +28,19 @@ def setUp(self): g = graphistry.nodes(ndf_reddit) g_with_edges = graphistry.nodes(edge_df, 'src').edges(edge_df, 'src', 'dst') - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning) - warnings.filterwarnings("ignore", category=DeprecationWarning) - warnings.filterwarnings("ignore", category=FutureWarning) - g2 = g.umap(X=['title', 'document'], - use_ngrams=True, - ngram_range=(1, 2)) - - g3 = g.umap(X=['title'], - use_ngrams=False, - min_words=2) - - # here we just featurize since edges are given - g4 = g_with_edges.featurize(X=['textual'], - use_ngrams=False, - min_words=1.1 - ) + g2 = g.umap(X=['title', 'document'], + use_ngrams=True, + ngram_range=(1, 2)) + + g3 = g.umap(X=['title'], + use_ngrams=False, + min_words=2) + + # here we just featurize since edges are given + g4 = g_with_edges.featurize(X=['textual'], + use_ngrams=False, + min_words=1.1 + ) self.g_ngrams = g2 self.g_emb = g3 diff --git a/graphistry/tests/test_umap_utils.py b/graphistry/tests/test_umap_utils.py index 82e4e28465..6f7cae28b3 100644 --- a/graphistry/tests/test_umap_utils.py +++ b/graphistry/tests/test_umap_utils.py @@ -1,8 +1,8 @@ from time import time from typing import Any +import warnings import pytest import unittest -import warnings import gc import graphistry @@ -45,7 +45,6 @@ logger = logging.getLogger(__name__) logging.getLogger("graphistry.umap_utils").setLevel(logging.DEBUG) -warnings.filterwarnings("ignore") # enable tests if has cudf and env didn't explicitly disable is_test_cudf = has_cudf and os.environ["TEST_CUDF"] != "0" @@ -139,7 +138,7 @@ def test_umap_kwargs_threaded(self, reddit_ndf: pd.DataFrame): assert runtime_large > 1.5 * runtime_small -class TestUMAPFitTransform(unittest.TestCase): +class TestUMAPFitTransformNodes(unittest.TestCase): # check to see that .fit and transform gives similar embeddings on same data @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies") def setUp(self): @@ -148,17 +147,13 @@ def setUp(self): self.test = ndf_reddit.sample(5) - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning) - warnings.filterwarnings("ignore", category=DeprecationWarning) - warnings.filterwarnings("ignore", category=FutureWarning) - g2 = g.umap( - y=['label', 'type'], - use_ngrams=True, - ngram_range=(1, 2), - use_scaler="robust", - cardinality_threshold=2 - ) + g2 = g.umap( + y=['label', 'type'], + use_ngrams=True, + ngram_range=(1, 2), + use_scaler="robust", + cardinality_threshold=2 + ) self.g2 = g2 fenc = g2._node_encoder @@ -171,65 +166,25 @@ def setUp(self): ndf_reddit, ndf_reddit, kind="nodes", return_graph=True ) - # do the same for 
edges - edge_df22 = edge_df2.copy() - edge_df22["rando"] = np.random.rand(edge_df2.shape[0]) - g = graphistry.edges(edge_df22, "src", "dst") - self.ge = g - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning) - warnings.filterwarnings("ignore", category=DeprecationWarning) - warnings.filterwarnings("ignore", category=FutureWarning) - g2 = g.umap( - y=['label'], - kind="edges", - use_ngrams=True, - ngram_range=(1, 2), - use_scaler=None, - use_scaler_target=None, - cardinality_threshold=2, - n_topics=4, - ) - - fenc = g2._edge_encoder - self.Xe, self.Ye = fenc.X, fenc.y - self.EMBe = g2._edge_embedding - self.embe, self.xe, self.ye = g2.transform_umap( - edge_df22, y=edge2_target_df, kind="edges", return_graph=False - ) - self.g2e = g2 - @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies") def test_columns_match(self): - assert set(self.X.columns) == set(self.x.columns), "Node Feature Columns do not match" - assert set(self.Y.columns) == set(self.y.columns), "Node Target Columns do not match" - assert set(self.Xe.columns) == set(self.xe.columns), "Edge Feature Columns do not match" - assert set(self.Ye.columns) == set(self.ye.columns), "Edge Target Columns do not match" + assert set(self.X.columns) == set(self.x.columns), f"Node Feature Columns do not match: {set(self.X.columns)} vs {set(self.x.columns)}" + assert set(self.Y.columns) == set(self.y.columns), f"Node Target Columns do not match: {set(self.Y.columns)} vs {set(self.y.columns)}" @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies") def test_index_match(self): # nodes d = self.g2._nodes.shape[0] - de = self.g2e._edges.shape[0] assert _eq(self.gn._nodes.index, self.g2._nodes.index).sum() == d, "Node Indexes do not match" assert _eq(self.gn._nodes.index, self.EMB.index).sum() == d, "Emb Indexes do not match" assert _eq(self.gn._nodes.index, self.emb.index).sum() == d, "Transformed Emb Indexes do not match" assert _eq(self.gn._nodes.index, self.X.index).sum() == d, "Transformed Node features Indexes do not match" assert _eq(self.gn._nodes.index, self.y.index).sum() == d, "Transformed Node target Indexes do not match" - # edges - assert _eq(self.ge._edges.index, self.g2e._edges.index).sum() == de, "Edge Indexes do not match" - assert _eq(self.ge._edges.index, self.EMBe.index).sum() == de, "Edge Emb Indexes do not match" - assert _eq(self.ge._edges.index, self.embe.index).sum() == de, "Edge Transformed Emb Indexes do not match" - assert _eq(self.ge._edges.index, self.Xe.index).sum() == de, "Edge Transformed features Indexes do not match" - assert _eq(self.ge._edges.index, self.ye.index).sum() == de, "Edge Transformed target Indexes do not match" - # make sure the indexes match at transform time internally as well assert _eq(self.X.index, self.x.index).sum() == d, "Node Feature Indexes do not match" assert _eq(self.Y.index, self.y.index).sum() == d, "Node Target Indexes do not match" - assert _eq(self.Xe.index, self.xe.index).sum() == de, "Edge Feature Indexes do not match" - assert _eq(self.Ye.index, self.ye.index).sum() == de, "Edge Target Indexes do not match" @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies") def test_node_index_match_in_infered_graph(self): @@ -240,14 +195,6 @@ def test_node_index_match_in_infered_graph(self): assert _eq(g3.index, self.X.index).sum() == len(g3), "Node Transformed features Indexes do not match" assert _eq(g3.index, self.y.index).sum() == len(g3), "Node Transformed target Indexes do not 
match" - @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies") - def test_edge_index_match_in_infered_graph(self): - g3 = self.g2e._edges - assert _eq(g3.index, self.EMBe.index).sum() == len(g3), "Edge Emb Indexes do not match" - assert _eq(g3.index, self.embe.index).sum() == len(g3), "Edge Transformed Emb Indexes do not match" - assert _eq(g3.index, self.Xe.index).sum() == len(g3), "Edge Transformed Node features Indexes do not match" - assert _eq(g3.index, self.ye.index).sum() == len(g3), "Edge Transformed Node target Indexes do not match" - @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies") def test_umap_kwargs(self): @@ -265,12 +212,8 @@ def test_umap_kwargs(self): umap_kwargs2 = {k: v + 1 for k, v in umap_kwargs.items() if k not in ['metric']} # type: ignore umap_kwargs2['metric'] = 'euclidean' g = graphistry.nodes(self.test) - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning) - warnings.filterwarnings("ignore", category=DeprecationWarning) - warnings.filterwarnings("ignore", category=FutureWarning) - g2 = g.umap(**umap_kwargs, engine='umap_learn') - g3 = g.umap(**umap_kwargs2, engine='umap_learn') + g2 = g.umap(**umap_kwargs, engine='umap_learn') + g3 = g.umap(**umap_kwargs2, engine='umap_learn') assert g2._umap_params == umap_kwargs assert ( g2._umap_params == umap_kwargs @@ -338,6 +281,71 @@ def test_transform_umap(self): assert True + + +class TestUMAPFitTransformEdges(unittest.TestCase): + # check to see that .fit and transform gives similar embeddings on same data + @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies") + def setUp(self): + + self.test = ndf_reddit.sample(5) + + # do the same for edges + edge_df22 = edge_df2.copy() + edge_df22["rando"] = np.random.rand(edge_df2.shape[0]) + g = graphistry.edges(edge_df22, "src", "dst") + self.ge = g + g2 = g.umap( + y=['label'], + kind="edges", + use_ngrams=True, + ngram_range=(1, 2), + use_scaler=None, + use_scaler_target=None, + cardinality_threshold=2, + n_topics=4, + ) + + fenc = g2._edge_encoder + self.Xe, self.Ye = fenc.X, fenc.y + self.EMBe = g2._edge_embedding + self.embe, self.xe, self.ye = g2.transform_umap( + edge_df22, y=edge2_target_df, kind="edges", return_graph=False + ) + self.g2e = g2 + + + @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies") + def test_columns_match(self): + assert set(self.Xe.columns) == set(self.xe.columns), f"Edge Feature Columns do not match: {set(self.Xe.columns)} vs {set(self.xe.columns)}" + assert set(self.Ye.columns) == set(self.ye.columns), f"Edge Target Columns do not match: {set(self.Ye.columns)} vs {set(self.ye.columns)}" + + @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies") + def test_index_match(self): + # nodes + de = self.g2e._edges.shape[0] + + # edges + assert _eq(self.ge._edges.index, self.g2e._edges.index).sum() == de, "Edge Indexes do not match" + assert _eq(self.ge._edges.index, self.EMBe.index).sum() == de, "Edge Emb Indexes do not match" + assert _eq(self.ge._edges.index, self.embe.index).sum() == de, "Edge Transformed Emb Indexes do not match" + assert _eq(self.ge._edges.index, self.Xe.index).sum() == de, "Edge Transformed features Indexes do not match" + assert _eq(self.ge._edges.index, self.ye.index).sum() == de, "Edge Transformed target Indexes do not match" + + # make sure the indexes match at transform time internally as well + assert _eq(self.Xe.index, self.xe.index).sum() == de, "Edge Feature 
Indexes do not match" + assert _eq(self.Ye.index, self.ye.index).sum() == de, "Edge Target Indexes do not match" + + + @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies") + def test_edge_index_match_in_infered_graph(self): + g3 = self.g2e._edges + assert _eq(g3.index, self.EMBe.index).sum() == len(g3), "Edge Emb Indexes do not match" + assert _eq(g3.index, self.embe.index).sum() == len(g3), "Edge Transformed Emb Indexes do not match" + assert _eq(g3.index, self.Xe.index).sum() == len(g3), "Edge Transformed Node features Indexes do not match" + assert _eq(g3.index, self.ye.index).sum() == len(g3), "Edge Transformed Node target Indexes do not match" + + class TestUMAPMethods(unittest.TestCase): def _check_attributes(self, g, attributes): msg = "Graphistry instance after umap should have `{}` as attribute" @@ -446,7 +454,7 @@ def test_umap_edgecase(self): df.loc[[1,20,35,42,30], 'z'] = 1 df.loc[[10,5,16,28,35], 'z'] = 1.0 df.loc[[12,7], 'z'] = 'NaN' - df.loc[[13,8], 'z'] = np.NaN + df.loc[[13,8], 'z'] = np.nan graphistry.nodes(df).umap() assert True @@ -506,40 +514,38 @@ class TestUMAPAIMethods(TestUMAPMethods): reason="requires ai+umap feature dependencies", ) def _test_umap(self, g, use_cols, targets, name, kind, df): - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning) - for scaler in ["kbins", "robust"]: - for cardinality in [2, 200]: - for use_ngram in [True, False]: - for use_col in use_cols: - for target in targets: - logger.debug("*" * 90) - value = [ - scaler, - cardinality, - use_ngram, - target, - use_col, - ] - logger.debug(f"{value}") - logger.debug("-" * 80) - - g2 = g.umap( - kind=kind, - X=use_col, - y=target, - model_name=model_avg_name, - use_scaler=scaler, - use_scaler_target=scaler, - use_ngrams=use_ngram, - engine="umap_learn", - cardinality_threshold=cardinality, - cardinality_threshold_target=cardinality, - n_neighbors=3, - dbscan=False, - ) - - self.cases_test_graph(g2, kind=kind, df=df) + for scaler in ["kbins", "robust"]: + for cardinality in [2, 200]: + for use_ngram in [True, False]: + for use_col in use_cols: + for target in targets: + logger.debug("*" * 90) + value = [ + scaler, + cardinality, + use_ngram, + target, + use_col, + ] + logger.debug(f"{value}") + logger.debug("-" * 80) + + g2 = g.umap( + kind=kind, + X=use_col, + y=target, + model_name=model_avg_name, + use_scaler=scaler, + use_scaler_target=scaler, + use_ngrams=use_ngram, + engine="umap_learn", + cardinality_threshold=cardinality, + cardinality_threshold_target=cardinality, + n_neighbors=3, + dbscan=False, + ) + + self.cases_test_graph(g2, kind=kind, df=df) @pytest.mark.skipif( not has_dependancy or not has_umap, @@ -550,19 +556,14 @@ def test_node_umap(self): use_cols = [None, text_cols_reddit, good_cols_reddit, meta_cols_reddit] targets = [None, single_target_reddit, double_target_reddit] - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning) - warnings.filterwarnings("ignore", category=DeprecationWarning) - warnings.filterwarnings("ignore", category=FutureWarning) - - self._test_umap( - g, - use_cols=use_cols, - targets=targets, - name="Node UMAP with `(target, use_col)=`", - kind="nodes", - df=ndf_reddit, - ) + self._test_umap( + g, + use_cols=use_cols, + targets=targets, + name="Node UMAP with `(target, use_col)=`", + kind="nodes", + df=ndf_reddit, + ) @pytest.mark.skipif( not has_dependancy or not has_umap, @@ -572,19 +573,14 @@ def test_edge_umap(self): g = graphistry.edges(edge_df2, "src", 
"dst") targets = [None, "label"] use_cols = [None, "title"] - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning) - warnings.filterwarnings("ignore", category=DeprecationWarning) - warnings.filterwarnings("ignore", category=FutureWarning) - - self._test_umap( - g, - use_cols=use_cols, - targets=targets, - name="Edge UMAP with `(target, use_col)=`", - kind="edges", - df=edge_df2, - ) + self._test_umap( + g, + use_cols=use_cols, + targets=targets, + name="Edge UMAP with `(target, use_col)=`", + kind="edges", + df=edge_df2, + ) @pytest.mark.skipif( not has_dependancy or not has_umap, @@ -599,7 +595,9 @@ def test_chaining_nodes(self): logger.debug("======= g3a.featurize() done ======") g3 = g3a.umap(dbscan=False) logger.debug("======= g3.umap() done ======") - assert g2._node_features.shape == g3._node_features.shape + assert len(g2._node_features) == len(g3._node_features) + assert g2._node_features.shape[1] < g3._node_features.shape[1] + # since g3 has feature params with x and y. g3._feature_params["nodes"]["X"].pop("x") g3._feature_params["nodes"]["X"].pop("y") @@ -607,7 +605,14 @@ def test_chaining_nodes(self): assert ( g2._feature_params["nodes"]["y"].shape == g3._feature_params["nodes"]["y"].shape ) # None - assert g2._node_embedding.shape == g3._node_embedding.shape # kinda weak sauce + assert len(g2._node_embedding) == len(g3._node_embedding) + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=UserWarning) + try: + assert set(g2._node_embedding.columns) == set(g3._node_embedding.columns) + except AssertionError: + warnings.warn("Columns do not match", UserWarning) @pytest.mark.skipif( not has_dependancy or not has_umap, @@ -615,12 +620,8 @@ def test_chaining_nodes(self): ) def test_chaining_edges(self): g = graphistry.edges(edge_df, "src", "dst") - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning) - warnings.filterwarnings("ignore", category=DeprecationWarning) - warnings.filterwarnings("ignore", category=FutureWarning) - g2 = g.umap(kind="edges", dbscan=False) - g3 = g.featurize(kind="edges").umap(kind="edges", dbscan=False) + g2 = g.umap(kind="edges", dbscan=False) + g3 = g.featurize(kind="edges").umap(kind="edges", dbscan=False) assert all(g2._feature_params["edges"]["X"] == g3._feature_params["edges"]["X"]) assert all( @@ -636,20 +637,15 @@ def test_feature_kwargs_yield_different_values_using_umap_api(self): g = graphistry.nodes(ndf_reddit) n_topics_target = 6 - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning) - warnings.filterwarnings("ignore", category=DeprecationWarning) - warnings.filterwarnings("ignore", category=FutureWarning) - - g2 = g.umap( - X="type", - y="label", - cardinality_threshold_target=3, - n_topics_target=n_topics_target, - ) # makes a GapEncoded Target - g3 = g.umap( - X="type", y="label", cardinality_threshold_target=30000 - ) # makes a one-hot-encoded target + g2 = g.umap( + X="type", + y="label", + cardinality_threshold_target=3, + n_topics_target=n_topics_target, + ) # makes a GapEncoded Target + g3 = g.umap( + X="type", y="label", cardinality_threshold_target=30000 + ) # makes a one-hot-encoded target assert all( g2._feature_params["nodes"]["X"] == g3._feature_params["nodes"]["X"] @@ -752,39 +748,37 @@ def teardown_class(cls: Any) -> None: reason="requires cuml feature dependencies", ) def _test_umap(self, g, use_cols, targets, name, kind, df): - with warnings.catch_warnings(): - 
warnings.filterwarnings("ignore", category=UserWarning) - for scaler in ["kbins", "robust"]: - for cardinality in [2, 200]: - for use_ngram in [True, False]: - for use_col in use_cols: - for target in targets: - logger.debug("*" * 90) - value = [ - scaler, - cardinality, - use_ngram, - target, - use_col, - ] - logger.debug(f"{name}:\n{value}") - logger.debug("-" * 80) - - g2 = g.umap( - kind=kind, - X=use_col, - y=target, - model_name=model_avg_name, - use_scaler=scaler, - use_scaler_target=scaler, - use_ngrams=use_ngram, - engine="cuml", - cardinality_threshold=cardinality, - cardinality_threshold_target=cardinality, - n_neighbors=3, - ) - - self.cases_test_graph(g2, kind=kind, df=df) + for scaler in ["kbins", "robust"]: + for cardinality in [2, 200]: + for use_ngram in [True, False]: + for use_col in use_cols: + for target in targets: + logger.debug("*" * 90) + value = [ + scaler, + cardinality, + use_ngram, + target, + use_col, + ] + logger.debug(f"{name}:\n{value}") + logger.debug("-" * 80) + + g2 = g.umap( + kind=kind, + X=use_col, + y=target, + model_name=model_avg_name, + use_scaler=scaler, + use_scaler_target=scaler, + use_ngrams=use_ngram, + engine="cuml", + cardinality_threshold=cardinality, + cardinality_threshold_target=cardinality, + n_neighbors=3, + ) + + self.cases_test_graph(g2, kind=kind, df=df) @pytest.mark.skipif( not has_dependancy or not has_cuml, @@ -799,19 +793,14 @@ def test_node_umap(self): continue targets[i] = target[:len(g._nodes)].reset_index(drop=True) - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning) - warnings.filterwarnings("ignore", category=DeprecationWarning) - warnings.filterwarnings("ignore", category=FutureWarning) - - self._test_umap( - g, - use_cols=use_cols, - targets=targets, - name="Node UMAP with `(target, use_col)=`", - kind="nodes", - df=g._nodes, - ) + self._test_umap( + g, + use_cols=use_cols, + targets=targets, + name="Node UMAP with `(target, use_col)=`", + kind="nodes", + df=g._nodes, + ) @pytest.mark.skipif( not has_dependancy or not has_cuml, @@ -821,19 +810,14 @@ def test_edge_umap(self): g = graphistry.edges(edge_df2, "src", "dst") targets = [None, "label"] use_cols = [None, "title"] - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning) - warnings.filterwarnings("ignore", category=DeprecationWarning) - warnings.filterwarnings("ignore", category=FutureWarning) - - self._test_umap( - g, - use_cols=use_cols, - targets=targets, - name="Edge UMAP with `(target, use_col)=`", - kind="edges", - df=edge_df2, - ) + self._test_umap( + g, + use_cols=use_cols, + targets=targets, + name="Edge UMAP with `(target, use_col)=`", + kind="edges", + df=edge_df2, + ) @pytest.mark.skipif( not has_dependancy or not has_cuml, @@ -848,7 +832,8 @@ def test_chaining_nodes(self): logger.debug("======= g3a.featurize() done ======") g3 = g3a.umap() logger.debug("======= g3.umap() done ======") - assert g2._node_features.shape == g3._node_features.shape, f"featurize() should be idempotent, found {g2._node_features.shape} != {g3._node_features.shape}" + assert g2._node_features.shape[0] == g3._node_features.shape[0] + assert g2._node_features.shape[1] < g3._node_features.shape[1] # since g3 has feature params with x and y. 
g3._feature_params["nodes"]["X"].pop("x") g3._feature_params["nodes"]["X"].pop("y") @@ -864,12 +849,8 @@ def test_chaining_nodes(self): ) def test_chaining_edges(self): g = graphistry.edges(edge_df, "src", "dst") - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning) - warnings.filterwarnings("ignore", category=DeprecationWarning) - warnings.filterwarnings("ignore", category=FutureWarning) - g2 = g.umap(kind="edges") - g3 = g.featurize(kind="edges").umap(kind="edges") + g2 = g.umap(kind="edges") + g3 = g.featurize(kind="edges").umap(kind="edges") assert all(g2._feature_params["edges"]["X"] == g3._feature_params["edges"]["X"]) assert all( @@ -885,20 +866,15 @@ def test_feature_kwargs_yield_different_values_using_umap_api(self): g = graphistry.nodes(ndf_reddit) n_topics_target = 6 - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning) - warnings.filterwarnings("ignore", category=DeprecationWarning) - warnings.filterwarnings("ignore", category=FutureWarning) - - g2 = g.umap( - X="type", - y="label", - cardinality_threshold_target=3, - n_topics_target=n_topics_target, - ) # makes a GapEncoded Target - g3 = g.umap( - X="type", y="label", cardinality_threshold_target=30000 - ) # makes a one-hot-encoded target + g2 = g.umap( + X="type", + y="label", + cardinality_threshold_target=3, + n_topics_target=n_topics_target, + ) # makes a GapEncoded Target + g3 = g.umap( + X="type", y="label", cardinality_threshold_target=30000 + ) # makes a one-hot-encoded target assert all( g2._feature_params["nodes"]["X"] == g3._feature_params["nodes"]["X"] @@ -943,8 +919,8 @@ def setUp(self): @pytest.mark.skipif(not has_dependancy or not has_cuml, reason="requires cuml dependencies") @pytest.mark.skipif(not is_test_cudf, reason="requires cudf") def test_base(self): - graphistry.nodes(self.df).umap('auto')._node_embedding.shape == (self.samples, 2) - graphistry.nodes(self.df).umap('engine')._node_embedding.shape == (self.samples, 2) + graphistry.nodes(self.df).umap(engine='auto')._node_embedding.shape == (self.samples, 2) + graphistry.nodes(self.df).umap(engine='cuml')._node_embedding.shape == (self.samples, 2) if __name__ == "__main__": diff --git a/graphistry/text_utils.py b/graphistry/text_utils.py index 1d83cb7a4e..0a786fe011 100644 --- a/graphistry/text_utils.py +++ b/graphistry/text_utils.py @@ -1,13 +1,11 @@ +from typing import TYPE_CHECKING +from inspect import getmodule +from logging import getLogger import pandas as pd from .feature_utils import FeatureMixin from .ai_utils import search_to_df, FaissVectorSearch from .constants import WEIGHT, DISTANCE -from logging import getLogger - -from typing import ( - TYPE_CHECKING, -) # noqa if TYPE_CHECKING: @@ -43,6 +41,8 @@ def build_index(self, angular=False, n_trees=None): self.assert_fitted() self.assert_features_line_up_with_nodes() X = self._get_feature("nodes") + if 'cudf' in str(getmodule(X)): + X = X.to_pandas() self.search_index = FaissVectorSearch( X.values ) # self._build_search_index(X, angular, n_trees, faiss=False) @@ -51,6 +51,10 @@ def _query_from_dataframe(self, qdf: pd.DataFrame, top_n: int, thresh: float): # Use the loaded featurizers to transform the dataframe vect, _ = self.transform(qdf, None, kind="nodes", return_graph=False) + nodes = self._nodes + if 'cudf' in str(getmodule(nodes)): + nodes = nodes.to_pandas() + results = self.search_index.search_df(vect, self._nodes, top_n) results = results.query(f"{DISTANCE} < {thresh}") @@ -76,7 +80,7 @@ def _query(self, query: 
str, top_n: int, thresh: float): for col in cols_text[1:]: qdf[col] = [""] - # this is hookey and needs to be fixed on dirty_cat side (with errors='ignore') + # this is hookey and needs to be fixed on skrub side (with errors='ignore') # if however min_words = 0, all columns will be textual, # and no other data_encoder will be generated if hasattr(self._node_encoder.data_encoder, "columns_"): # type: ignore @@ -213,15 +217,40 @@ def search_graph( # print('shape of edges', edf.shape) rdf = df = res._nodes # print('shape of nodes', rdf.shape) + + if 'cudf' in str(getmodule(edges)): + import cudf + + if not isinstance(rdf, cudf.DataFrame): + rdf = cudf.from_pandas(rdf) + df = rdf + + concat = cudf.concat + cudf_coercion = True + else: + concat = pd.concat + cudf_coercion = False + node = res._node indices = rdf[node] + if cudf_coercion: + import cudf + if not isinstance(indices, cudf.Series): + indices = cudf.Series.from_pandas(indices) src = res._source dst = res._destination if query != "": # run a real query, else return entire graph rdf, _ = res.search(query, thresh=thresh, fuzzy=True, top_n=top_n) if not rdf.empty: + if cudf_coercion: + import cudf + #if not isinstance(indices, cudf.Series): + # indices = cudf.Series.from_pandas(indices) + if not isinstance(rdf, cudf.DataFrame): + rdf = cudf.from_pandas(rdf) indices = rdf[node] + # now get edges from indices if broader: # this will make a broader graph, finding NN in src OR dst edges = edf[(edf[src].isin(indices)) | (edf[dst].isin(indices))] @@ -239,19 +268,35 @@ def search_graph( except: # for explicit edges pass - found_indices = pd.concat([edges[src], edges[dst], indices], axis=0).unique() + #logger.info('type edges=%s, indices=%s', type(edges), type(indices)) + #raise ValueError(f'stop here: {type(edges)}, {type(indices)}') + + found_indices = concat([edges[src], edges[dst], indices], axis=0).unique() emb = None + node_feats = res._node_features + if cudf_coercion: + import cudf + if not isinstance(node_feats, cudf.DataFrame): + node_feats = cudf.from_pandas(node_feats) + + node_emb = res._node_embedding + if cudf_coercion and res._umap is not None: + import cudf + node_emb = res._node_embedding + if not isinstance(node_emb, cudf.DataFrame): + node_emb = cudf.from_pandas(node_emb) + try: tdf = rdf.iloc[found_indices] - feats = res._node_features.iloc[found_indices] # type: ignore + feats = node_feats.iloc[found_indices] # type: ignore if res._umap is not None: - emb = res._node_embedding.iloc[found_indices] # type: ignore + emb = node_emb.iloc[found_indices] # type: ignore except Exception: # for explicit relabeled nodes #logger.exception(e) tdf = rdf[df[node].isin(found_indices)] - feats = res._node_features.loc[tdf.index] # type: ignore + feats = node_feats.loc[tdf.index] # type: ignore if res._umap is not None: - emb = res._node_embedding[df[node].isin(found_indices)] # type: ignore + emb = node_emb[df[node].isin(found_indices)] # type: ignore logger.info(f" - Returning edge dataframe of size {edges.shape[0]}") # get all the unique nodes logger.info( diff --git a/graphistry/umap_utils.py b/graphistry/umap_utils.py index 5b963b1f1d..6c14de5e32 100644 --- a/graphistry/umap_utils.py +++ b/graphistry/umap_utils.py @@ -2,10 +2,14 @@ from time import time from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union from inspect import getmodule +import warnings import numpy as np import pandas as pd +from graphistry.Engine import Engine, df_to_engine +from graphistry.models.compute.features import GraphEntityKind +from 
graphistry.models.compute.umap import UMAPEngine, UMAPEngineConcrete, umap_engine_values from graphistry.utils.lazy_import import ( lazy_cudf_import, lazy_umap_import, @@ -13,7 +17,7 @@ ) from . import constants as config from .constants import CUML, UMAP_LEARN -from .feature_utils import (FeatureMixin, Literal, XSymbolic, YSymbolic, +from .feature_utils import (FeatureMixin, XSymbolic, YSymbolic, resolve_feature_engine) from .PlotterBase import Plottable, WeakValueDictionary from .util import check_set_memoize, setup_logger @@ -59,14 +63,11 @@ def is_legacy_cuml(): return False -UMAPEngineConcrete = Literal['cuml', 'umap_learn'] -UMAPEngine = Literal[UMAPEngineConcrete, "auto"] - - def resolve_umap_engine( engine: UMAPEngine, -) -> UMAPEngineConcrete: # noqa - if engine in [CUML, UMAP_LEARN]: +) -> UMAPEngineConcrete: + + if engine in umap_engine_values: return engine # type: ignore if engine in ["auto"]: has_cuml_dependancy_, _, _ = lazy_cuml_import() @@ -76,39 +77,63 @@ def resolve_umap_engine( if has_umap_dependancy_: return 'umap_learn' - raise ValueError( # noqa + raise ValueError( f'engine expected to be "auto", ' '"umap_learn", or "cuml" ' f"but received: {engine} :: {type(engine)}" ) -def make_safe_gpu_dataframes(X, y, engine): +def umap_model_to_engine(v: Any) -> Optional[UMAPEngineConcrete]: + + if v is None: + return None + + try: + from umap import UMAP + if isinstance(v, UMAP): + return 'umap_learn' + except (ModuleNotFoundError, ImportError): + pass + + try: + from cuml import UMAP + if isinstance(v, UMAP): + return 'cuml' + except (ModuleNotFoundError, ImportError): + pass + + raise ValueError(f"Unknown UMAP engine: {v}") + + +def make_safe_umap_gpu_dataframes( + X: pd.DataFrame, y: Optional[pd.DataFrame], engine: UMAPEngineConcrete +) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]: + + if engine not in ["umap_learn", "cuml"]: + raise ValueError(f"Expected engine to be umap_learn or cuml, got {engine}") def safe_cudf(X, y): - # remove duplicate columns - if len(X.columns) != len(set(X.columns)): - X = X.loc[:, ~X.columns.duplicated()] - try: - y = y.loc[:, ~y.columns.duplicated()] - except: - pass + import cudf + + #if y is not None and normalize: + # X, y = normalize_X_y(X, y) + # logger.debug('Normalized X: %s %s', X.dtypes, y.dtypes if y is not None else None) new_kwargs = {} kwargs = {'X': X, 'y': y} for key, value in kwargs.items(): - if isinstance(value, cudf.DataFrame) and engine in ["pandas", "umap_learn", "dirty_cat"]: - new_kwargs[key] = value.to_pandas() - elif isinstance(value, pd.DataFrame) and engine in ["cuml", "cu_cat"]: - new_kwargs[key] = cudf.from_pandas(value) - else: - new_kwargs[key] = value + if value is None: + new_kwargs[key] = None + elif engine == "umap_learn": + new_kwargs[key] = df_to_engine(value, Engine.PANDAS) + elif engine == 'cuml': + new_kwargs[key] = df_to_engine(value, Engine.CUDF) return new_kwargs['X'], new_kwargs['y'] - has_cudf_dependancy_, _, cudf = lazy_cudf_import() - if has_cudf_dependancy_: + if 'cudf' in str(getmodule(X)) or (y is not None and 'cudf' in str(getmodule(y))) or engine == "cuml": return safe_cudf(X, y) - else: - return X, y + + return X, y ############################################################################### @@ -119,11 +144,19 @@ def safe_cudf(X, y): # ############################################################################# -def reuse_umap(g: Plottable, memoize: bool, metadata: Any): # noqa: C901 - return check_set_memoize( +def reuse_umap(g: Plottable, memoize: bool, metadata: Any) -> 
Optional[Plottable]: + o = check_set_memoize( g, metadata, attribute="_umap_param_to_g", name="umap", memoize=memoize ) + if o is False: + return None + + if isinstance(o, Plottable): + return o + + raise ValueError(f'Expected Plottable or False, got {type(o)}') + def umap_graph_to_weighted_edges(umap_graph, engine: UMAPEngineConcrete, is_legacy, cfg=config): logger.debug("Calculating weighted adjacency (edge) DataFrame") @@ -194,13 +227,12 @@ class UMAPMixin(MIXIN_BASE): def __init__(self, *args, **kwargs): #self._umap_initialized = False - #self.engine = self.engine if hasattr(self, "engine") else None pass def umap_lazy_init( self, - res, + res: Plottable, n_neighbors: int = 12, min_dist: float = 0.1, spread: float = 0.5, @@ -215,7 +247,7 @@ def umap_lazy_init( umap_fit_kwargs: Dict[str, Any] = {}, umap_transform_kwargs: Dict[str, Any] = {}, ): - from graphistry.features import ModelDict + from graphistry.models.ModelDict import ModelDict engine_resolved = resolve_umap_engine(engine) # FIXME remove as set_new_kwargs will always replace? @@ -251,10 +283,12 @@ def umap_lazy_init( logger.debug('lazy init') # set new umap kwargs + res._umap_engine = engine_resolved res._umap_params = umap_params res._umap_fit_kwargs = umap_fit_kwargs res._umap_transform_kwargs = umap_transform_kwargs + #assert isinstance(res, UMAPMixin) res._n_components = n_components res._metric = metric res._n_neighbors = n_neighbors @@ -265,7 +299,6 @@ def umap_lazy_init( res._negative_sample_rate = negative_sample_rate res._umap = umap_engine.UMAP(**umap_params) logger.debug('Initialized UMAP with params: %s', umap_params) - res.engine = engine_resolved res._suffix = suffix return res @@ -306,7 +339,9 @@ def umap_fit( logger.info("-" * 90) logger.info(f"Starting UMAP-ing data of shape {X.shape}") - if self.engine == CUML and is_legacy_cuml(): # type: ignore + engine = umap_model_to_engine(self._umap) + + if engine == CUML and is_legacy_cuml(): # type: ignore from cuml.neighbors import NearestNeighbors knn = NearestNeighbors(n_neighbors=self._n_neighbors) # type: ignore @@ -319,7 +354,7 @@ def umap_fit( self._weighted_adjacency = self._umap.graph_ # if changing, also update fresh_res self._weighted_edges_df = umap_graph_to_weighted_edges( - self._umap.graph_, self.engine, is_legacy_cuml() # type: ignore + self._umap.graph_, engine, is_legacy_cuml() # type: ignore ) mins = (time() - t) / 60 @@ -338,24 +373,27 @@ def _umap_fit_transform( if self._umap is None: raise ValueError("UMAP is not initialized") self.umap_fit(X, y, umap_fit_kwargs) - logger.debug('_umap_fit_transform:\nX::%s\n%s\n%s\nkwargs:\n%s\ny:\n%s', type(X), X.dtypes, X, umap_transform_kwargs, y) - #logger.debug('per col types: %s', {k: (type(X[k]), X[k].dtype) for k in X.columns}) - try: - logger.debug('X as pandas', X.to_pandas()) # type: ignore - except: - pass + logger.debug('_umap_fit_transform:\nX::%s\n%s\nkwargs:\n%s\ny:\n%s', type(X), X.dtypes, umap_transform_kwargs, y.dtypes if y is not None else None) emb = self._umap.transform(X, **umap_transform_kwargs) + + engine = umap_model_to_engine(self._umap) + if engine == CUML: + import cudf + assert isinstance(emb, cudf.DataFrame), f'Expected cudf.DataFrame, got {type(emb)}' + elif engine == UMAP_LEARN: + assert isinstance(emb, np.ndarray), f'Expected np.ndarray, got {type(emb)}' + emb = self._bundle_embedding(emb, index=X.index) return emb - def transform_umap(self, df: pd.DataFrame, - y: Optional[pd.DataFrame] = None, - kind: str = 'nodes', - min_dist: Union[str, float, int] = 'auto', + def 
transform_umap(self, df: pd.DataFrame, + y: Optional[pd.DataFrame] = None, + kind: GraphEntityKind = 'nodes', + min_dist: Union[str, float, int] = 'auto', n_neighbors: int = 7, merge_policy: bool = False, - sample: Optional[int] = None, + sample: Optional[int] = None, return_graph: bool = True, fit_umap_embedding: bool = True, umap_transform_kwargs: Dict[str, Any] = {} @@ -375,14 +413,19 @@ def transform_umap(self, df: pd.DataFrame, return_graph: Whether to return a graph or just the embeddings fit_umap_embedding: Whether to infer graph from the UMAP embedding on the new data, default True """ - df, y = make_safe_gpu_dataframes(df, y, 'pandas') + + engine = self._umap_engine + assert engine is not None, f'Expected self._umap_engine to be resolved, got {engine}' + + df, y = make_safe_umap_gpu_dataframes(df, y, engine) X, y_ = self.transform(df, y, kind=kind, return_graph=False) - X, y_ = make_safe_gpu_dataframes(X, y_, self.engine) # type: ignore + X, y_ = make_safe_umap_gpu_dataframes(X, y_, engine) # type: ignore + assert self._umap is not None, 'Expected self._umap to be initialized' emb = self._umap.transform(X, **umap_transform_kwargs) # type: ignore emb = self._bundle_embedding(emb, index=df.index) - if return_graph and kind not in ["edges"]: - emb, _ = make_safe_gpu_dataframes(emb, None, 'pandas') # for now so we don't have to touch infer_edges, force to pandas - X, y_ = make_safe_gpu_dataframes(X, y_, 'pandas') + if return_graph and kind == 'nodes': + emb, _ = make_safe_umap_gpu_dataframes(emb, None, engine) # for now so we don't have to touch infer_edges, force to pandas + X, y_ = make_safe_umap_gpu_dataframes(X, y_, engine) g = self._infer_edges(emb, X, y_, df, infer_on_umap_embedding=fit_umap_embedding, merge_policy=merge_policy, eps=min_dist, sample=sample, n_neighbors=n_neighbors) @@ -391,26 +434,46 @@ def transform_umap(self, df: pd.DataFrame, def _bundle_embedding(self, emb, index): # Converts Embedding into dataframe and takes care if emb.dim > 2 - if emb.shape[1] == 2 and 'cudf.core.dataframe' not in str(getmodule(emb)) and not hasattr(emb, 'device'): - emb = pd.DataFrame(emb, columns=[config.X, config.Y], index=index) - elif emb.shape[1] == 2 and 'cudf.core.dataframe' in str(getmodule(emb)): - emb.rename(columns={0: config.X, 1: config.Y}, inplace=True) - elif emb.shape[1] == 2 and hasattr(emb, 'device'): + + engine = umap_model_to_engine(self._umap) + + if engine == CUML: import cudf - emb = cudf.DataFrame(emb, columns=[config.X, config.Y], index=index) - else: - columns = [config.X, config.Y] + [ - f"umap_{k}" for k in range(2, emb.shape[1]) - ] - if 'cudf.core.dataframe' not in str(getmodule(emb)): - emb = pd.DataFrame(emb, columns=columns, index=index) - elif 'cudf.core.dataframe' in str(getmodule(emb)): - emb.columns = columns + if not isinstance(emb, cudf.DataFrame): + warnings.warn(f'Expected cudf.DataFrame, trying to convert from {type(emb)}') + if isinstance(emb, pd.DataFrame): + emb = cudf.DataFrame.from_pandas(emb) + else: + emb = cudf.DataFrame(emb) + if emb.shape[1] == 2: + emb.rename(columns={0: config.X, 1: config.Y}, inplace=True) + return emb + elif engine == UMAP_LEARN: + if 'cudf.core.dataframe' in str(getmodule(emb)): + warnings.warn(f'cudf detected, but not imported, will try to convert to pandas: type={type(emb)}') + emb = emb.to_pandas() + #raise ValueError(f'Did not expect cudf value for sklearn engine, emb type: {type(emb)}') + if emb.shape[1] == 2 and 'cudf.core.dataframe' not in str(getmodule(emb)) and not hasattr(emb, 'device'): + return 
pd.DataFrame(emb, columns=[config.X, config.Y], index=index) + elif emb.shape[1] == 2 and hasattr(emb, 'device') and emb.device == 'cuda': + try: + import cudf + emb = cudf.DataFrame(emb, columns=[config.X, config.Y], index=index).to_pandas() + except (ModuleNotFoundError, ImportError): + pass + + columns = [config.X, config.Y] + [ + f"umap_{k}" for k in range(2, emb.shape[1]) + ] + if 'cudf.core.dataframe' not in str(getmodule(emb)): + emb = pd.DataFrame(emb, columns=columns, index=index) + elif 'cudf.core.dataframe' in str(getmodule(emb)): + emb.columns = columns return emb def _process_umap( self, - res, + res: 'UMAPMixin', X_: pd.DataFrame, y_: pd.DataFrame, kind, @@ -441,7 +504,7 @@ def _process_umap( old_res = reuse_umap(res, memoize, umap_kwargs_reuse) if old_res: logger.debug(" --- [[ RE-USING UMAP ]], umap previous n_components: %s", umap_params['n_components']) - fresh_res = copy.copy(res) + fresh_res: UMAPMixin = copy.copy(res) for attr in ["_xy", "_weighted_edges_df", "_weighted_adjacency"]: if hasattr(old_res, attr): setattr(fresh_res, attr, getattr(old_res, attr)) @@ -465,7 +528,7 @@ def _process_umap( return res def _set_features( # noqa: E303 - self, res, X, y, kind, feature_engine, featurize_kwargs + self, res: 'UMAPMixin', X, y, kind, feature_engine, featurize_kwargs ): """ Helper for setting features for memoize @@ -575,7 +638,7 @@ def umap( :engine: selects which engine to use to calculate UMAP: default "auto" will use cuML if available, otherwise UMAP-LEARN. :feature_engine: How to encode data - ("none", "auto", "pandas", "dirty_cat", "torch") + ("none", "auto", "pandas", "skrub", "torch") :inplace: bool = False, whether to modify the current object, default False. when False, returns a new object, useful for chaining in a functional paradigm. 
:memoize: whether to memoize the results of this method, @@ -609,30 +672,11 @@ def umap( ) logger.debug("umap_kwargs: %s", umap_kwargs_combined) - # temporary until we have full cudf support in feature_utils.py - has_cudf, _, cudf = lazy_cudf_import() - if inplace: res = self else: res = self.bind() - if has_cudf: - flag_nodes_cudf = isinstance(self._nodes, cudf.DataFrame) - flag_edges_cudf = isinstance(self._edges, cudf.DataFrame) - - #if flag_nodes_cudf or flag_edges_cudf: - if False: - if flag_nodes_cudf: - res._nodes = res._nodes.to_pandas() - if flag_edges_cudf: - res._edges = res._edges.to_pandas() - if (X is not None) or (y is not None): - res = res.umap(X=X, y=y, kind=kind, feature_engine=feature_engine, **umap_kwargs_combined, **featurize_kwargs) # type: ignore - else: - res = res.umap(X=self._nodes, y=self._edges, kind=kind, feature_engine=feature_engine, **umap_kwargs_combined, **featurize_kwargs) # type: ignore - return res - res = res.umap_lazy_init( res, n_neighbors=n_neighbors, @@ -649,6 +693,8 @@ def umap( umap_fit_kwargs=umap_fit_kwargs, umap_transform_kwargs=umap_transform_kwargs ) + engine_resolved = res._umap_engine + assert engine_resolved is not None, f'Expected engine to be resolved, got {engine_resolved}' logger.debug("umap input X :: %s", X) logger.debug("umap input y :: %s", y) @@ -680,19 +726,19 @@ def umap( **featurize_kwargs ) - logger.debug("umap X_ (%s): %s", type(X_), X_) - logger.debug("umap y_ (%s): %s", type(y_), y_) + logger.debug("umap X_ (%s): %s", type(X_), X_.columns) + logger.debug("umap y_ (%s): %s", type(y_), y_.columns) logger.debug("data is type :: %s", (type(X_))) if isinstance(X_, pd.DataFrame): index_to_nodes_dict = dict(zip(range(len(nodes)), nodes)) elif 'cudf.core.dataframe' in str(getmodule(X_)): + import cudf assert isinstance(X_, cudf.DataFrame) logger.debug('nodes type: %s', type(nodes)) import cupy as cp index_to_nodes_dict = dict(zip(range(len(nodes)), cp.asnumpy(nodes))) - # add the safe coercion here - X_, y_ = make_safe_gpu_dataframes(X_, y_, res.engine) # type: ignore + X_, y_ = make_safe_umap_gpu_dataframes(X_, y_, engine_resolved) # type: ignore res = res._process_umap( res, X_, y_, kind, memoize, featurize_kwargs, **umap_kwargs_combined @@ -722,7 +768,7 @@ def umap( ) # add the safe coercion here - X_, y_ = make_safe_gpu_dataframes(X_, y_, res.engine) # type: ignore + X_, y_ = make_safe_umap_gpu_dataframes(X_, y_, engine_resolved) # type: ignore res = res._process_umap( res, X_, y_, kind, memoize, featurize_kwargs, **umap_kwargs_combined @@ -767,7 +813,7 @@ def umap( res, kind, encode_position, encode_weight, play ) # noqa: E501 - if res.engine == CUML and is_legacy_cuml(): # type: ignore + if engine_resolved == CUML and is_legacy_cuml(): # type: ignore res = res.prune_self_edges() if dbscan: diff --git a/graphistry/util.py b/graphistry/util.py index c2c47996f1..0525a6f1ee 100644 --- a/graphistry/util.py +++ b/graphistry/util.py @@ -1,5 +1,5 @@ +from typing import Any, Union import hashlib -import json import logging import os import pandas as pd @@ -10,9 +10,9 @@ import uuid import warnings from functools import lru_cache -from typing import Any -from collections import UserDict +from graphistry.models.ModelDict import ModelDict +from graphistry.Plottable import Plottable from .constants import VERBOSE, CACHE_COERCION_SIZE, TRACE @@ -132,8 +132,8 @@ def hash_memoize(v: Any) -> str: def check_set_memoize( - g, metadata, attribute, name: str = "", memoize: bool = True -): # noqa: C901 + g: Plottable, metadata, attribute: str, 
name: str = "", memoize: bool = True +) -> Union[bool, Any]: """ Helper Memoize function that checks if metadata args have changed for object g -- which is unconstrained save for the fact that it must have `attribute`. If they have not changed, will return memoized version, @@ -309,65 +309,6 @@ def deprecated_func(*args, **kwargs): # ############################################################################# -# MODEL Parameter HELPERS -def get_timestamp(): - import datetime - - return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - - -class ModelDict(UserDict): - """Helper class to print out model names and keep track of updates - - Args: - message: description of model - verbose: print out model names, logging happens regardless - """ - - def __init__(self, message, verbose=True, _timestamp=False, *args, **kwargs): - self._message = message - self._verbose = verbose - self._timestamp = _timestamp # do no use this inside the class, as it will trigger memoization. Only use outside of class. - L = ( - len(message) - if _timestamp is False - else max(len(message), len(get_timestamp()) + 1) - ) - self._print_length = min(80, L) - self._updates = [] - super().__init__(*args, **kwargs) - - def print(self, message): - if self._timestamp: - message = f"{message}\n{get_timestamp()}" - if self._verbose: - print("_" * self._print_length) - print() - print(message) - print("_" * self._print_length) - print() - - def __repr__(self): - # logger.info(self._message) - self.print(self._message) - return super().__repr__() - - # def __setitem__(self, key, value): # can't get this to work properly as it doesn't get called on update - # self._updates.append({key: value}) - # if len(self._updates) > 1: - # self._message += ( - # "\n" + "_" * self._print_length + f"\n\nUpdated: {self._updates[-1]}" - # ) - # return super().__setitem__(key, value) - - def update(self, *args, **kwargs): - self._updates.append(args[0]) - if len(self._updates) > 1: # don't take first update since its the init/default - self._message += ( - "\n" + "_" * self._print_length + f"\n\nUpdated: {self._updates[-1]}" - ) - return super().update(*args, **kwargs) - def is_notebook(): """Check if running in a notebook""" @@ -391,31 +332,3 @@ def printmd(string, color=None, size=20): from IPython.display import Markdown, display colorstr = "{}".format(color, size, string) display(Markdown(colorstr)) - -# -# def inspect_decorator(func, args, kwargs): -# import inspect -# frame = inspect.currentframe() -# args, _, _, values = inspect.getargvalues(frame) -# func_name = inspect.getframeinfo(frame)[2] -# print(f'function name "{func_name}"') -# for i in args: -# print(" %s = %s" % (i, values[i])) -# return [(i, values[i]) for i in args] -# -# -# # custom decorator -# def showargs_decorator(func): -# import functools -# # updates special attributes e.g. 
__name__, __doc__
-# @functools.wraps(func)
-# def wrapper(*args, **kwargs):
-#
-# # call custom inspection logic
-# inspect_decorator(func, args, kwargs)
-#
-# # calls original function
-# func(*args, **kwargs)
-#
-# # matches name of inner function
-# return wrapper
diff --git a/graphistry/utils/lazy_import.py b/graphistry/utils/lazy_import.py
index f7de35bdbf..6f56a92127 100644
--- a/graphistry/utils/lazy_import.py
+++ b/graphistry/utils/lazy_import.py
@@ -78,6 +78,17 @@ def lazy_dirty_cat_import():
         logger.warn('Unexpected exn during lazy import', exc_info=e)
         return False, e, None
 
+def lazy_skrub_import():
+    warnings.filterwarnings("ignore")
+    try:
+        import skrub
+        return True, 'ok', skrub
+    except ModuleNotFoundError as e:
+        return False, e, None
+    except Exception as e:
+        logger.warn('Unexpected exn during lazy import', exc_info=e)
+        return False, e, None
+
 def lazy_embed_import():
     try:
         import torch
diff --git a/mypy.ini b/mypy.ini
index cf448f8b72..fbec922048 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -83,6 +83,9 @@ ignore_missing_imports = True
 [mypy-squarify.*]
 ignore_missing_imports = True
 
+[mypy-skrub.*]
+ignore_missing_imports = True
+
 [mypy-torch.*]
 ignore_missing_imports = True
 
diff --git a/setup.py b/setup.py
index 06b6caa43f..935d2f7c8f 100755
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@ def unique_flatten_dict(d):
     'pandas-stubs', 'types-requests', 'ipython', 'tqdm-stubs'
 ]
 
-test_workarounds = ['scikit-learn<=1.3.2']
+test_workarounds = []
 
 dev_extras = {
     'docs': [
@@ -56,11 +56,18 @@ def unique_flatten_dict(d):
 }
 
 base_extras_heavy = {
-    'umap-learn': ['umap-learn', 'dirty-cat==0.2.0', 'scikit-learn>=1.0'],
+    'umap-learn': ['umap-learn','skrub', 'scikit-learn', 'scipy'],
     'pygraphviz': ['pygraphviz'], # + apt-get graphviz, graphviz-dev
+    'rapids': [
+        "cudf-cu12==24.12.*", "dask-cudf-cu12==24.12.*", "cuml-cu12==24.12.*",
+        "cugraph-cu12==24.12.*", "nx-cugraph-cu12==24.12.*",
+        #"cuspatial-cu12==24.12.*",
+        #"cuproj-cu12==24.12.*", "cuxfilter-cu12==24.12.*", "cucim-cu12==24.12.*",
+        #"pylibraft-cu12==24.12.*", "raft-dask-cu12==24.12.*", "cuvs-cu12==24.12.*",
+    ],
 }
 
 # https://github.com/facebookresearch/faiss/issues/1589 for faiss-cpu 1.6.1, #'setuptools==67.4.0' removed
-base_extras_heavy['ai'] = base_extras_heavy['umap-learn'] + ['scipy', 'dgl', 'torch<2', 'sentence-transformers', 'faiss-cpu', 'joblib']
+base_extras_heavy['ai'] = base_extras_heavy['umap-learn'] + ['scipy', 'dgl', 'torch', 'sentence-transformers', 'faiss-cpu', 'joblib']
 
 base_extras = {**base_extras_light, **base_extras_heavy}
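
Usage sketch (illustrative, not part of the patch): the test changes above exercise the refactored UMAP path roughly as follows. This is a minimal sketch assuming `pip install graphistry[umap-learn]` on Python 3.9+; the DataFrame and its `type`/`label` columns are invented for the example, and `engine="auto"` falls back to umap-learn when cuML is not importable.

    import pandas as pd
    import graphistry

    # toy node table; the columns mirror the X="type", y="label" pattern used in the tests
    ndf = pd.DataFrame({
        "type": ["post", "comment", "user", "mod"] * 50,
        "label": ["a" if i % 2 else "b" for i in range(200)],
    })

    # featurize + embed in one call; engine="auto" resolves to cuML when importable, else umap-learn
    g = graphistry.nodes(ndf).umap(X="type", y="label", engine="auto", n_neighbors=7, dbscan=False)

    # the 2D layout is bound as g._node_embedding (x/y columns), features as g._node_features
    print(g._node_embedding.shape, g._node_features.shape)

    # push new rows through the fitted featurizer and UMAP model;
    # with the default return_graph=True this also infers edges for the new points
    g2 = g.transform_umap(ndf.sample(20, random_state=0))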