Skip to content

Commit

Permalink
Handle unseen values in MCA
Browse files Browse the repository at this point in the history
  • Loading branch information
MaxHalford committed Nov 17, 2024
1 parent 0644d88 commit c6618d2
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 23 deletions.
6 changes: 4 additions & 2 deletions prince/mca.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,9 @@ def __init__(

def _prepare(self, X):
if self.one_hot:
# One-hot encode X; if columns were recorded by a previous fit, add any of them that are missing here (filled with False)
X = pd.get_dummies(X, columns=X.columns)
X = pd.get_dummies(X, columns=X.columns, prefix_sep="__")
if (one_hot_columns_ := getattr(self, "one_hot_columns_", None)) is not None:
X = X.reindex(columns=one_hot_columns_.union(X.columns), fill_value=False)
return X

def get_feature_names_out(self, input_features=None):
Expand All @@ -62,6 +63,7 @@ def fit(self, X, y=None):

# One-hot encode the data
one_hot = self._prepare(X)
self.one_hot_columns_ = one_hot.columns

# We need the number of columns to apply the Greenacre correction
self.J_ = one_hot.shape[1]
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "prince"
version = "0.13.1"
version = "0.14.0"
description = "Factor analysis in Python: PCA, CA, MCA, MFA, FAMD, GPA"
authors = ["Max Halford <maxhalford25@gmail.com>"]
license = "MIT"
Expand Down
92 changes: 72 additions & 20 deletions tests/test_mca.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import io
import tempfile

import numpy as np
Expand Down Expand Up @@ -64,7 +65,10 @@ def test_col_coords(self):
if self.sup_cols:
F = pd.concat((F, load_df_from_R("ca$quali.sup$coord")))
P = self.ca.column_coordinates(self.dataset)
np.testing.assert_allclose(F.abs(), P.abs())
# Prince adds a prefix to each column. We need to remove it in order to align the rows
# of the two dataframes
P.index = [idx.split("__", 1)[1] for idx in P.index]
np.testing.assert_allclose(F.abs(), P.abs().loc[F.index])
else:
super().test_col_coords()

Expand All @@ -74,7 +78,10 @@ def test_col_cos2(self):
if self.sup_cols:
F = pd.concat((F, load_df_from_R("ca$quali.sup$cos2")))
P = self.ca.column_cosine_similarities(self.dataset)
np.testing.assert_allclose(F, P)
# Prince adds a prefix to each column. We need to remove it in order to align the rows
# of the two dataframes
P.index = [idx.split("__", 1)[1] for idx in P.index]
np.testing.assert_allclose(F, P.loc[F.index])
else:
super().test_col_cos2()

Expand All @@ -89,23 +96,23 @@ def test_with_and_without_one_hot():
>>> mca = prince.MCA(n_components=2, one_hot=True, engine="scipy")
>>> mca = mca.fit(df)
>>> mca.transform(df).round(2).abs().sort_index(axis='columns')
0 1
0 2.0 0.00
1 0.5 0.65
2 0.5 0.65
3 0.5 0.65
4 0.5 1.94
0 1
0 0.00 2.0
1 0.65 0.5
2 0.65 0.5
3 0.65 0.5
4 1.94 0.5
>>> mca = prince.MCA(n_components=2, one_hot=False, engine="scipy")
>>> one_hot = pd.get_dummies(df, columns=['foo', 'bar'])
>>> mca = mca.fit(one_hot)
>>> mca.transform(one_hot).round(2).abs().sort_index(axis='columns')
0 1
0 2.0 0.00
1 0.5 0.65
2 0.5 0.65
3 0.5 0.65
4 0.5 1.94
0 1
0 0.00 2.0
1 0.65 0.5
2 0.65 0.5
3 0.65 0.5
4 1.94 0.5
"""

Expand All @@ -122,12 +129,12 @@ def test_issue_131():
>>> mca = prince.MCA(engine="scipy")
>>> mca = mca.fit(df)
>>> mca.transform(df).round(2).abs().sort_index(axis='columns')
0 1
0 2.0 0.00
1 0.5 0.65
2 0.5 0.65
3 0.5 0.65
4 0.5 1.94
0 1
0 0.00 2.0
1 0.65 0.5
2 0.65 0.5
3 0.65 0.5
4 1.94 0.5
>>> mca.K_, mca.J_
(2, 8)
Expand Down Expand Up @@ -185,3 +192,48 @@ def test_type_doesnt_matter():

for i in range(len(outputs) - 1):
np.testing.assert_allclose(outputs[i], outputs[i + 1])


# CSV fixture for test_issue_161 (https://github.com/MaxHalford/prince/issues/161).
# Row 3 carries a userid ("c@b.com") and a location ("San Diego, ...") that do
# not occur in rows 0-2.
# The backslash right after the opening quotes suppresses the leading blank
# line. A triple-quoted string already contains its physical line breaks, so no
# explicit "\n" escapes are needed (they would double every line break).
issue_161_data = '''\
,category,userid,location,applicationname,browser
0,Portal Login,a@b.com,"San Jose, CA, United States",A,Chrome
1,Application Access,b@b.com,"San Jose, CA, United States",B,Other
2,Application Access,a@b.com,"San Jose, CA, United States",C,Other
3,Portal Login,c@b.com,"San Diego, CA, United States",A,Chrome
'''

def test_issue_161():
    """Regression test: transform a row whose categories were not seen during fit.

    https://github.com/MaxHalford/prince/issues/161

    The MCA is fitted on the first three rows of ``issue_161_data`` and is then
    asked to transform the fourth row, whose ``userid`` and ``location`` values
    do not occur in the rows used for fitting.

    >>> data = pd.read_csv(io.StringIO(issue_161_data), index_col=0)
    >>> mca = prince.MCA(
    ...     n_components=10,
    ...     n_iter=3,
    ...     copy=True,
    ...     check_input=True,
    ...     engine='sklearn',
    ...     random_state=42,
    ...     handle_unknown='ignore'
    ... )
    >>> mca = mca.fit(data[:3])
    >>> mca.eigenvalues_summary
              eigenvalue % of variance % of variance (cumulative)
    component
    0              0.673        67.32%                     67.32%
    1              0.327        32.68%                    100.00%
    >>> mca.row_coordinates(data[:3])
              0         1
    0  1.120811 -0.209242
    1 -0.820491 -0.571660
    2 -0.300320  0.780902
    >>> mca.transform(data[3:])
              0         1
    3  1.664888 -0.640285
    """

0 comments on commit c6618d2

Please sign in to comment.