From c24413b1546bfa73a37f8e35f97c2695bcba2fb1 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Jun 2023 17:56:06 -0400 Subject: [PATCH] Adding new simulations Signed-off-by: Adam Li --- docs/references.bib | 61 +- experiments/plotting_cmi_analysis.ipynb | 763 ++++++++++++++++++++++-- sktree/experimental/__init__.py | 3 +- sktree/experimental/forest.py | 67 +++ sktree/experimental/ksg.py | 268 ++++++++- sktree/experimental/meson.build | 1 + sktree/experimental/simulate.py | 108 +++- sktree/experimental/tests/meson.build | 1 + sktree/experimental/tests/test_ksg.py | 51 ++ sktree/neighbors.py | 118 +++- 10 files changed, 1322 insertions(+), 119 deletions(-) create mode 100644 sktree/experimental/forest.py create mode 100644 sktree/experimental/tests/test_ksg.py diff --git a/docs/references.bib b/docs/references.bib index 07c251309..b2139ffb9 100644 --- a/docs/references.bib +++ b/docs/references.bib @@ -19,12 +19,12 @@ @article{Li2019manifold } @article{perry2021random, - title={Random Forests for Adaptive Nearest Neighbor Estimation of Information-Theoretic Quantities}, - author={Ronan Perry and Ronak Mehta and Richard Guo and Eva Yezerets and Jesús Arroyo and Mike Powell and Hayden Helm and Cencheng Shen and Joshua T. Vogelstein}, - year={2021}, - eprint={1907.00325}, - journal={arXiv}, - primaryClass={cs.LG} + title = {Random Forests for Adaptive Nearest Neighbor Estimation of Information-Theoretic Quantities}, + author = {Ronan Perry and Ronak Mehta and Richard Guo and Eva Yezerets and Jesús Arroyo and Mike Powell and Hayden Helm and Cencheng Shen and Joshua T. Vogelstein}, + year = {2021}, + eprint = {1907.00325}, + journal = {arXiv}, + primaryclass = {cs.LG} } @inproceedings{Meghana2019_geodesicrf, @@ -57,26 +57,37 @@ @article{TomitaSPORF2020 } @article{Darbellay1999Entropy, - title={Estimation of the Information by an Adaptive Partitioning of the Observation Space}, - author={Georges A. Darbellay and Igor Vajda}, - journal={IEEE Trans. Inf. Theory}, - year={1999}, - volume={45}, - pages={1315-1321} + title = {Estimation of the Information by an Adaptive Partitioning of the Observation Space}, + author = {Georges A. Darbellay and Igor Vajda}, + journal = {IEEE Trans. Inf. Theory}, + year = {1999}, + volume = {45}, + pages = {1315-1321} +} + +@article{kozachenko1987sample, + title = {Sample estimate of the entropy of a random vector}, + author = {Kozachenko, Lyudmyla F and Leonenko, Nikolai N}, + journal = {Problemy Peredachi Informatsii}, + volume = {23}, + number = {2}, + pages = {9--16}, + year = {1987}, + publisher = {Russian Academy of Sciences, Branch of Informatics, Computer Equipment and~…} } @article{Kraskov_2004, - title = {Estimating mutual information}, - volume = {69}, - url = {https://link.aps.org/doi/10.1103/PhysRevE.69.066138}, - doi = {10.1103/PhysRevE.69.066138}, - number = {6}, - urldate = {2023-01-27}, - journal = {Physical Review E}, - author = {Kraskov, Alexander and Stögbauer, Harald and Grassberger, Peter}, - month = jun, - year = {2004}, - note = {Publisher: American Physical Society}, - pages = {066138}, - file = {APS Snapshot:/Users/adam2392/Zotero/storage/GRW23BYU/PhysRevE.69.html:text/html;Full Text PDF:/Users/adam2392/Zotero/storage/NJT9QCVA/Kraskov et al. - 2004 - Estimating mutual information.pdf:application/pdf} + title = {Estimating mutual information}, + volume = {69}, + url = {https://link.aps.org/doi/10.1103/PhysRevE.69.066138}, + doi = {10.1103/PhysRevE.69.066138}, + number = {6}, + urldate = {2023-01-27}, + journal = {Physical Review E}, + author = {Kraskov, Alexander and Stögbauer, Harald and Grassberger, Peter}, + month = jun, + year = {2004}, + note = {Publisher: American Physical Society}, + pages = {066138}, + file = {APS Snapshot:/Users/adam2392/Zotero/storage/GRW23BYU/PhysRevE.69.html:text/html;Full Text PDF:/Users/adam2392/Zotero/storage/NJT9QCVA/Kraskov et al. - 2004 - Estimating mutual information.pdf:application/pdf} } \ No newline at end of file diff --git a/experiments/plotting_cmi_analysis.ipynb b/experiments/plotting_cmi_analysis.ipynb index ad0f7cd8c..02cfd6400 100644 --- a/experiments/plotting_cmi_analysis.ipynb +++ b/experiments/plotting_cmi_analysis.ipynb @@ -60,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "705fb134-54c2-42fd-bbc3-1d4541ae8769", "metadata": {}, "outputs": [], @@ -70,7 +70,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "id": "2e7b6950-44a0-40a9-bf2a-b6eca06725c1", "metadata": {}, "outputs": [], @@ -78,8 +78,10 @@ "import numpy as np\n", "import scipy\n", "import scipy.spatial\n", + "from tqdm import tqdm\n", "\n", "from sklearn.neighbors import NearestNeighbors\n", + "from sktree.neighbors import forest_distance\n", "import sktree\n", "from sktree.experimental.simulate import (\n", " simulate_helix,\n", @@ -87,6 +89,7 @@ " simulate_sphere,\n", " embed_high_dims,\n", ")\n", + "from sktree.experimental.ksg import _compute_radius_nbrs, mutual_info_ksg_nn\n", "from sktree.experimental.mutual_info import (\n", " entropy_gaussian,\n", " entropy_weibull,\n", @@ -96,12 +99,13 @@ " cmi_gaussian,\n", " mi_gamma,\n", ")\n", - "from sktree.experimental import mutual_info_ksg\n", + "from sktree.experimental import mutual_info_ksg, SupervisedInfoForest\n", "from sktree.tree import compute_forest_similarity_matrix\n", "from sktree import (\n", " UnsupervisedRandomForest,\n", " UnsupervisedObliqueRandomForest,\n", " ObliqueRandomForestClassifier,\n", + " ObliqueRandomForestRegressor,\n", " NearestNeighborsMetaEstimator,\n", ")\n", "\n", @@ -114,7 +118,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "id": "bf387bc2-f1d1-4b7a-9dab-e9c8fb992b2a", "metadata": {}, "outputs": [], @@ -133,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "id": "e5089afb-c359-423d-92b1-641fca6315b2", "metadata": {}, "outputs": [], @@ -144,7 +148,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 9, "id": "4fcde0a4-7413-4f09-bbd8-fe62fcd62c2d", "metadata": {}, "outputs": [], @@ -156,7 +160,7 @@ "\n", "# hyperparameters of the simulation\n", "n_samples = 1000\n", - "n_noise_dims = 1\n", + "n_noise_dims = 20\n", "alpha = 0.001\n", "\n", "# dimensionality of mvg\n", @@ -178,14 +182,14 @@ "id": "dc222a29-5a31-486e-b2e3-6e276a61f81f", "metadata": {}, "source": [ - "## Demonstrate a single simulation\n", + "## Setup a single simulation\n", "\n", "Now, to demonstrate what the data would look like fromm a single parameterized simulation, we want to show the entire workflow from data generation to analysis and output value." ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 10, "id": "3cc73c4e-78b6-48b3-83b3-091d365d8ada", "metadata": {}, "outputs": [], @@ -214,7 +218,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 11, "id": "aa1a67bb-55fa-4983-b2d1-a23102945b7c", "metadata": {}, "outputs": [ @@ -253,7 +257,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 12, "id": "a841ffc8-e821-4c0a-8df0-9b55b1be3aba", "metadata": {}, "outputs": [ @@ -282,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 13, "id": "383a17a7-6c4f-49fc-aa4d-322f820ac18e", "metadata": {}, "outputs": [ @@ -304,7 +308,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 14, "id": "570cf521-f62f-4e62-98e4-0aea8e4f72db", "metadata": {}, "outputs": [ @@ -337,7 +341,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 15, "id": "438211c0-228c-453d-aaec-02813988177f", "metadata": {}, "outputs": [ @@ -358,7 +362,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 298, "id": "ff5c92e5-65e6-4990-9287-708e4af32450", "metadata": {}, "outputs": [ @@ -366,7 +370,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "(1000, 2) (1000, 3)\n" + "(1000, 21) (1000, 3)\n" ] } ], @@ -379,28 +383,54 @@ "print(X.shape, data.shape)" ] }, + { + "cell_type": "markdown", + "id": "2e7eb0d5-75f9-4934-af65-1985bb395b38", + "metadata": {}, + "source": [ + "### Unsupervised KSG approach" + ] + }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 330, "id": "cc2209c6-3562-44e2-95d9-8343e24b5e28", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1000, 21) (1000, 21) (1000, 42)\n" + ] + } + ], "source": [ + "# define the forest-based estimator to determine distances\n", "clf = UnsupervisedObliqueRandomForest(\n", - " n_estimators=10,\n", + " n_estimators=200,\n", " feature_combinations=feature_combinations,\n", " n_jobs=n_jobs,\n", " random_state=seed,\n", ")\n", - "# clf.fit(highdim_data)\n", + "\n", + "# meta-estimator for nearest-neighbor lookup\n", "est = NearestNeighborsMetaEstimator(\n", - " clf, n_neighbors=n_nbrs, algorithm=\"auto\", n_jobs=n_jobs\n", - ")" + " estimator=clf,\n", + " n_neighbors=n_nbrs,\n", + " algorithm=\"auto\",\n", + " n_jobs=n_jobs,\n", + " force_fit=True,\n", + " verbose=True,\n", + ")\n", + "\n", + "data = np.hstack((X, Y))\n", + "print(X.shape, Y.shape, data.shape)" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 320, "id": "aadc7bb0-40e5-4e5c-9e99-f810d570b19f", "metadata": {}, "outputs": [ @@ -420,7 +450,25 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 321, + "id": "d49a1ad6-55f3-42aa-b68c-26b009970ad7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1000, 21) (1000, 21) (1000, 42)\n" + ] + } + ], + "source": [ + "print(X.shape, Y.shape, data.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 322, "id": "3bded813-31ea-44ba-bf77-973b3338f0fc", "metadata": {}, "outputs": [ @@ -443,7 +491,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 331, "id": "c051b1e1-4262-49c0-8383-74565b192a6b", "metadata": { "tags": [] @@ -453,21 +501,112 @@ "name": "stdout", "output_type": "stream", "text": [ - "Data shape: (1000, 4)\n", - "X shape: (1000, 2), Y shape: (1000, 2)\n", - "Preprocessing complete.\n", - "Using 10 neighbors to define D-dimensional volume.\n" + "Finished fitting estimator: UnsupervisedObliqueRandomForest(feature_combinations=2.0, n_estimators=200,\n", + " n_jobs=-1, random_state=12345)\n", + "Finished computing distance matrix: (1000, 1000)\n", + "Finished fitting estimator: UnsupervisedObliqueRandomForest(feature_combinations=2.0, n_estimators=200,\n", + " n_jobs=-1, random_state=12345)\n", + "Finished computing distance matrix: (1000, 1000)\n", + "Finished computing distance matrix: (1000, 1000)\n", + "Computing radius neighbors for 1000 samples\n", + "Computing radius neighbors in parallel...\n", + "Finished fitting estimator: UnsupervisedObliqueRandomForest(feature_combinations=2.0, n_estimators=200,\n", + " n_jobs=-1, random_state=12345)\n", + "Finished computing distance matrix: (1000, 1000)\n", + "Finished computing distance matrix: (1000, 1000)\n", + "Computing radius neighbors for 1000 samples\n", + "Computing radius neighbors in parallel...\n" ] } ], "source": [ - "ksg_mi = mutual_info_ksg(X, Y, nn_estimator=est, n_jobs=n_jobs, k=10, verbose=True)" + "geo_ksg_mi = mutual_info_ksg(\n", + " X, Y, nn_estimator=est, n_jobs=n_jobs, k=0.2, verbose=False\n", + ")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 329, "id": "162653a5-d2f9-4e17-91fa-04be2c8515d7", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.6599834335427683\n" + ] + } + ], + "source": [ + "print(geo_ksg_mi)" + ] + }, + { + "cell_type": "code", + "execution_count": 332, + "id": "5a56b255-4f29-4ce7-a0ee-616063c10ad5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.478834083567829\n" + ] + } + ], + "source": [ + "ksg_mi = mutual_info_ksg_nn(X, Y, n_jobs=n_jobs, k=0.2, verbose=False)\n", + "print(ksg_mi)" + ] + }, + { + "cell_type": "code", + "execution_count": 300, + "id": "21a59855-6d48-490e-b746-231871281d9e", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/var/folders/6_/sl83qtkd68x3_mvfys07_6qm0000gn/T/ipykernel_32459/95748408.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mn_neighbors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malgorithm\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"kd_tree\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mn_jobs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m )\n\u001b[0;32m----> 4\u001b[0;31m ksg_mi = mutual_info_ksg(\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnn_estimator\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnn_estimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mn_jobs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m )\n", + "\u001b[0;32m~/Documents/scikit-tree/sktree/experimental/ksg.py\u001b[0m in \u001b[0;36mmutual_info_ksg\u001b[0;34m(X, Y, Z, k, norm, nn_estimator, n_jobs, transform, random_seed, verbose)\u001b[0m\n\u001b[1;32m 197\u001b[0m \u001b[0mval\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_cmi_ksg\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx_idx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_idx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mz_idx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnn_estimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mknn_here\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 198\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 199\u001b[0;31m \u001b[0mval\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_mi_ksg\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx_idx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_idx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnn_estimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mknn_here\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 200\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 201\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnorm\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'max'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Documents/scikit-tree/sktree/experimental/ksg.py\u001b[0m in \u001b[0;36m_mi_ksg\u001b[0;34m(data, x_idx, y_idx, nn_estimator, knn_here, verbose)\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 263\u001b[0m \u001b[0;31m# compute on the subspace of X\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 264\u001b[0;31m \u001b[0mnum_nn_x\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_compute_radius_nbrs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mradius_per_sample\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnn_estimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcol_idx\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mx_idx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 265\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 266\u001b[0m \u001b[0;31m# compute on the subspace of Y\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Documents/scikit-tree/sktree/experimental/ksg.py\u001b[0m in \u001b[0;36m_compute_radius_nbrs\u001b[0;34m(data, radius_per_sample, nn_estimator, col_idx)\u001b[0m\n\u001b[1;32m 363\u001b[0m \u001b[0mnn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 364\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mradius\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mradius_per_sample\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 365\u001b[0;31m nn_ = nn_estimator.radius_neighbors(\n\u001b[0m\u001b[1;32m 366\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0matleast_2d\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcol_idx\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mradius\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mradius\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreturn_distance\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 367\u001b[0m )\n", + "\u001b[0;32m~/miniforge3/envs/sktree/lib/python3.9/site-packages/sklearn/neighbors/_base.py\u001b[0m in \u001b[0;36mradius_neighbors\u001b[0;34m(self, X, radius, return_distance, sort_results)\u001b[0m\n\u001b[1;32m 1146\u001b[0m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_check_precomputed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1147\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1148\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"csr\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"C\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1149\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1150\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mradius\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniforge3/envs/sktree/lib/python3.9/site-packages/sklearn/base.py\u001b[0m in \u001b[0;36m_validate_data\u001b[0;34m(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)\u001b[0m\n\u001b[1;32m 588\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Validation should be done on X, y or both.\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 589\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 590\u001b[0;31m \u001b[0mdefault_check_params\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m\"estimator\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 591\u001b[0m \u001b[0mcheck_params\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mdefault_check_params\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_params\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 592\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "nn_estimator = NearestNeighbors(\n", + " n_neighbors=5, algorithm=\"kd_tree\", p=np.inf, n_jobs=n_jobs\n", + ")\n", + "ksg_mi = mutual_info_ksg(\n", + " X, Y, nn_estimator=nn_estimator, n_jobs=n_jobs, k=0.3, verbose=False\n", + ")\n", + "print(ksg_mi)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5bbc83e4-a7d1-4639-82b8-f4efe621c7ec", "metadata": {}, "outputs": [], "source": [ @@ -476,41 +615,220 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 257, + "id": "5f2aea06-d2d1-4320-a3fd-3aecca36fb14", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 1 1 1 1 1 4\n", + " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 1 1\n", + " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 7 1 1 1 1 1 1 1 1 1\n", + " 1 1 1 1 4 1 1 1 1 1 10 1 1 1 1 1 1 1 1 6 1 1 1 1\n", + " 1 1 1 1 1 1 1 1 1 1 1 1 1 6 2 1 1 1 1 1 1 1 1 1\n", + " 1 1 1 1 1 1 1 1 1 1 1 1 1 6 1 1 1 1 1 1 1 1 5 1\n", + " 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + " 1 1 1 1 5 1 1 1 7 1 1 1 1 1 4 1 7 1 1 1 1 1 1 1\n", + " 1 1 1 1 2 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1\n", + " 1 6 1 1 1 1 1 2 1 1 1 1 7 1 1 1 1 1 3 1 1 1 1 1\n", + " 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 5 1 1 1 1 1 1 1\n", + " 1 1 12 9 1 1 1 1 5 1 1 1 1 1 1 1 1 1 7 1 1 1 1 1\n", + " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 8 1 1 1 1 1 1 1 1\n", + " 6 1 1 2 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + " 1 1 1 4 1 8 1 1 1 1 1 1 1 2 1 1 1 1 2 1 7 1 3 1\n", + " 1 1 1 1 1 1 1 1 1 2 1 2 4 1 1 1 1 1 1 3 9 4 1 3\n", + " 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1\n", + " 7 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + " 1 1 1 1 1 1 2 1 1 1 1 1 1 1 12 1 1 1 1 5 1 5 1 1\n", + " 1 1 1 1 1 1 1 1 1 1 1 6 1 1 1 1 2 1 1 1 1 1 20 1\n", + " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + " 1 1 1 1 1 1 1 1 1 1 1 9 1 1 1 9 1 1 1 1 1 1 1 2\n", + " 1 5 4 1 1 1 5 1 1 1 1 1 1 1 1 1 1 7 1 1 1 1 1 2\n", + " 1 1 1 1 7 1 1 1 9 1 1 5 1 1 1 1 5 1 4 2 1 1 1 1\n", + " 1 1 1 3 1 11 1 1 1 3 1 1 1 1 1 1 1 1 1 1 8 1 1 1\n", + " 1 3 3 14 1 1 1 2 1 1 1 1 1 3 1 1 1 1 1 1 1 5 1 1\n", + " 1 1 1 1 1 8 7 3 1 1 1 1 10 1 1 1 1 1 8 8 1 2 1 1\n", + " 1 1 1 3 11 1 1 1 9 7 1 1 6 1 1 1 1 5 4 6 1 1 1 1\n", + " 1 1 1 1 1 3 1 1 1 1 2 1 11 1 1 1 1 11 1 1 1 1 1 1\n", + " 1 1 1 1 1 1 1 1 1 1 1 1 5 1 1 1 1 2 1 1 1 1 1 1\n", + " 1 8 1 1 11 8 1 1 1 1 1 1 1 1 9 5 1 1 1 1 1 1 3 2\n", + " 1 12 1 4 1 1 1 1 1 1 7 5 2 1 1 1 1 1 1 1 10 5 1 1\n", + " 11 1 1 6 1 1 1 9 1 1 1 1 1 1 1 8 1 1 1 1 1 1 3 7\n", + " 2 1 1 10 1 1 5 1 1 3 1 1 1 1 1 1 2 1 1 1 1 1 8 1\n", + " 1 3 5 1 1 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1 1 1\n", + " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 7 1 1 8 1 1 1 1\n", + " 1 1 1 1 1 1 1 1 5 1 1 2 1 1 1 1 1 1 1 1 5 1 1 1\n", + " 1 1 1 13 4 1 1 1 11 1 1 1 1 1 1 1 1 1 1 4 2 2 1 1\n", + " 7 1 1 1 4 1 1 1 1 1 1 10 1 1 1 1 1 1 2 3 1 1 6 1\n", + " 1 1 1 1 8 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 5 1 1 1\n", + " 1 1 1 1 1 1 1 1 14 9 20 1 1 1 1 1 1 1 1 1 9 2 1 1\n", + " 7 1 1 1 8 1 1 2 1 1 1 1 8 1 1 12]\n" + ] + } + ], + "source": [ + "nn_estimator.fit(data[:, col_idx])\n", + "\n", + "nn = []\n", + "for idx, radius in enumerate(radius_per_sample):\n", + " nn_ = nn_estimator.radius_neighbors(\n", + " X=np.atleast_2d(data[:, col_idx]), radius=radius, return_distance=False\n", + " )\n", + " nn.append(nn_[idx])\n", + "print(np.array([len(nn_) for nn_ in nn]))" + ] + }, + { + "cell_type": "code", + "execution_count": 256, + "id": "daabc070-9fdf-4896-adb7-f18c16d81adc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[array([2])]\n" + ] + } + ], + "source": [ + "print(nn[])" + ] + }, + { + "cell_type": "code", + "execution_count": 301, "id": "ecc4b670-3099-457b-b783-3e0aab245e6c", "metadata": {}, "outputs": [], "source": [ - "cmi_est = CMITest(k=0.2, n_jobs=n_jobs, random_seed=seed)" + "cmi_est = CMITest(k=0.3, n_jobs=n_jobs, random_seed=seed)" ] }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 302, "id": "4b5ce049-3722-4586-ae05-f2b6273fcead", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.6522836124816846" + "1.1180955699940993" ] }, - "execution_count": 71, + "execution_count": 302, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_x = pd.DataFrame(X, columns=[f'x{idx}' for idx in range(X.shape[1])])\n", + "df_x = pd.DataFrame(X, columns=[f\"x{idx}\" for idx in range(X.shape[1])])\n", "x_cols = df_x.columns\n", - "df_y = pd.DataFrame(Y, columns=[f'y{idx}' for idx in range(Y.shape[1])])\n", + "df_y = pd.DataFrame(Y, columns=[f\"y{idx}\" for idx in range(Y.shape[1])])\n", "y_cols = df_y.columns\n", "df = pd.concat((df_x, df_y), axis=1)\n", "\n", "cmi_est._compute_cmi(df, x_cols, y_cols, set())" ] }, + { + "cell_type": "markdown", + "id": "9dae4346-5af2-4435-ad9d-a7dab1681d22", + "metadata": {}, + "source": [ + "### Supervised entropy estimate approach\n", + "\n", + "This is tabled until we get \"proba\" estimates for regression trees." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "77cc2540-819a-4fdd-b426-29c377490195", + "metadata": {}, + "outputs": [], + "source": [ + "estimator = ObliqueRandomForestRegressor(n_estimators=n_estimators, n_jobs=n_jobs)\n", + "sup_cmi_est = SupervisedInfoForest(estimator=estimator, y_categorical=False, n_jobs=-1)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "836b2183-9d0e-4454-ae8b-733b3419a659", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/adam2392/miniforge3/envs/sktree/lib/python3.9/site-packages/sklearn/utils/validation.py:1189: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " y = column_or_1d(y, warn=True)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1000, 2) (1000,)\n" + ] + }, + { + "data": { + "text/html": [ + "
SupervisedInfoForest(estimator=ObliqueRandomForestRegressor(n_jobs=-1),\n",
+       "                     n_jobs=-1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "SupervisedInfoForest(estimator=ObliqueRandomForestRegressor(n_jobs=-1),\n", + " n_jobs=-1)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y = Y[:, [0]]\n", + "\n", + "sup_cmi_est.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "ea70bc36-0bb5-4f96-bcd4-23763c88bb86", + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'ObliqueRandomForestRegressor' object has no attribute 'predict_proba'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/var/folders/6_/sl83qtkd68x3_mvfys07_6qm0000gn/T/ipykernel_32459/3154728669.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msup_cmi_est\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict_cmi\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/Documents/scikit-tree/sktree/experimental/forest.py\u001b[0m in \u001b[0;36mpredict_cmi\u001b[0;34m(self, X, Z)\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[0;31m# compute the entropy of H(Y | X, Z)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 54\u001b[0;31m \u001b[0mH_yxz\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mestimator_yxz_\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict_proba\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mXZ\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 55\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0;31m# compute the entropy of H(Y | Z)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'ObliqueRandomForestRegressor' object has no attribute 'predict_proba'" + ] + } + ], + "source": [ + "print(sup_cmi_est.predict_cmi(X))" + ] + }, { "cell_type": "markdown", "id": "67330326-469a-48de-9107-7d5cfdcfd4b7", @@ -523,9 +841,376 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "0a875b34-9a97-4bdf-ab45-14a1be6e60a5", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0. 5. 10. 15. 20. 25. 30. 35. 40. 45. 50. 55. 60. 65.\n", + " 70. 75. 80. 85. 90. 95. 100.]\n" + ] + } + ], + "source": [ + "# fix the number of samples, but increase the number of noise dimensions\n", + "n_samples = 1000\n", + "\n", + "n_jobs = -1\n", + "n_estimators = 300\n", + "feature_combinations = 1.5\n", + "k_ratio = 0.2\n", + "\n", + "# semi-parametric\n", + "n_nbrs = 0.2\n", + "\n", + "# hyperparameters of the simulation\n", + "alpha = 0.01\n", + "\n", + "# dimensionality of mvg\n", + "d = 3\n", + "\n", + "n_repeats = 10\n", + "\n", + "noise_dims_grid = np.linspace(0, 100, 21)\n", + "print(noise_dims_grid)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "a21a0542-2557-4863-935a-697892700973", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.8879202209746868\n", + "0.9849302324167128\n" + ] + } + ], + "source": [ + "X = embed_high_dims(data[:, [0]], n_dims=n_noise_dims, random_state=seed)\n", + "Y = embed_high_dims(data[:, [1]], n_dims=n_noise_dims, random_state=seed)\n", + "Z = embed_high_dims(data[:, [2]], n_dims=n_noise_dims, random_state=seed)\n", + "highdim_data = np.hstack((X, Y, Z))\n", + "\n", + "# true MI and CMI\n", + "x_cov_idx = 0\n", + "y_cov_idx = 1\n", + "z_cov_idx = 2\n", + "true_mi = mi_gaussian(cov, y_cov_idx, z_cov_idx)\n", + "true_cmi = cmi_gaussian(cov, y_cov_idx, z_cov_idx, x_cov_idx)\n", + "print(true_mi)\n", + "print(true_cmi)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e95a90ef-2a18-4de3-a0ad-af45a842cad1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/21 [00:00" + ] + }, + "execution_count": 345, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAGxCAYAAACeKZf2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/P9b71AAAACXBIWXMAAA9hAAAPYQGoP6dpAAA64ElEQVR4nO3deXxU1f3/8fdkkkwSkkkIyySBsKkIyBZWIy61xiJSLGqVr1pB6lJbrEu+31ZRAWmrsd/WrZWWr7RutQjqT9AqizRK1RpBAqEgiyBLAmQSwjKThWwz5/dHcGAkhASSTHJ5PR+PeQxz5tyZz5yQmXfOPfeOzRhjBAAAYBFhoS4AAACgORFuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApYSHuoDW5vf7tW/fPsXFxclms4W6HAAA0AjGGJWWliolJUVhYQ3PzZx14Wbfvn1KTU0NdRkAAOA0FBQUqHv37g32OevCTVxcnKS6wXE6nSGuBgAANIbX61Vqamrgc7whZ124+WZXlNPpJNwAANDONGZJCQuKAQCApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApZx1X5zZUspryrXt0LYmbdOYL/86HcaYFnnc9sKo4dfflPE5/mdk08l/Xifr19A29dX57bbW+Fk29v/hqWppqPZT/Uy+GadTjfc399c3xvW9jpPV0NhxDtrmZI91itfZ2Bq+rd7X/622xvzsgsbqZP9PGznuJ7v/ZONxqvtPtd3JnOp1N/Z39Uz7NrStdPKf8Unbm/A7Vt84NuZ37mTPcbLXWu/vXD1t326PCY/RuR3PbeDVtCzCTTPZfni7bl16a6jLAAAg5IZ0GaLXrn4tZM9PuGkmkWGR6h7bvcUe38ic8q+E47WFWaGWrrmlx6Ohv4JO5y+k+sajqX95N2W7xtR0yv6NrPl0+hz/HPXVV197Y8b62zWfbKwaqrkpsxqn3L4RtTRFY2aZ6ms73dmpxs7Aneov+8aMY2PG50xmMxs749Zcv99S87yv1Td2zTnL+e3fr4Z+9if0PcnvZpfoLvW9vFZDuGkm/Tv119Lrl4a6DAAAznosKAYAAJZCuAEAAJZCuAEAAJZCuAEAAJZCuAEAAJZCuAEAAJZCuAEAAJZCuAEAAJZCuAEAAJZCuAEAAJZCuAEAAJYS0nDz8ccfa8KECUpJSZHNZtPixYsbve2///1vhYeHa+jQoS1WHwAAaH9CGm7Ky8s1ZMgQzZkzp0nbHT58WJMnT9YVV1zRQpUBAID2KqTfCj5u3DiNGzeuydvdfffduvnmm2W325s02wMAAKyv3a25eemll7Rjxw7NmjUr1KUAAIA2KKQzN021bds2PfTQQ/rkk08UHt640quqqlRVVRW47fV6W6o8AADQBrSbmRufz6ebb75Zs2fPVt++fRu9XVZWluLj4wOX1NTUFqwSAACEms0YY0JdhCTZbDYtWrRIEydOrPf+w4cPq2PHjrLb7YE2v98vY4zsdrs++OADffe73z1hu/pmblJTU+XxeOR0Opv9dQAAgObn9XoVHx/fqM/vdrNbyul0asOGDUFtf/rTn/Thhx/qrbfeUu/evevdzuFwyOFwtEaJAACgDQhpuCkrK9P27dsDt3fu3Km8vDwlJiaqR48emj59uvbu3atXX31VYWFhGjhwYND2Xbt2VVRU1AntAADg7BXScLNmzRpdfvnlgduZmZmSpClTpujll19WYWGh8vPzQ1UeAABoh9rMmpvW0pR9dgAAoG1oyud3uzlaCgAAoDEINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFIINwAAwFJCGm4+/vhjTZgwQSkpKbLZbFq8eHGD/d9++21deeWV6tKli5xOp9LT07V8+fLWKRYAALQLIQ035eXlGjJkiObMmdOo/h9//LGuvPJKLVmyRLm5ubr88ss1YcIErVu3roUrBQAA7YXNGGNCXYQk2Ww2LVq0SBMnTmzSdhdccIEmTZqkmTNnNqq/1+tVfHy8PB6PnE7naVQKAABaW1M+v8NbqaYW4ff7VVpaqsTExJP2qaqqUlVVVeC21+ttjdIAAECItOsFxb///e9VVlamG2+88aR9srKyFB8fH7ikpqa2YoUAAKC1tdtwM3/+fM2ePVtvvPGGunbtetJ+06dPl8fjCVwKCgpasUoAANDa2uVuqQULFuiOO+7Qm2++qYyMjAb7OhwOORyOVqoMAACEWrubuXn99dc1depUvf766xo/fnyoywEAAG1MSGduysrKtH379sDtnTt3Ki8vT4mJierRo4emT5+uvXv36tVXX5VUtytqypQpeu655zR69Gi53W5JUnR0tOLj40PyGgAAQNsS0pmbNWvWKC0tTWlpaZKkzMxMpaWlBQ7rLiwsVH5+fqD/Cy+8oNraWk2bNk3JycmBy3333ReS+gEAQNvTZs5z01o4zw0AAO1PUz6/292aGwAAgIYQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKUQbgAAgKWENNx8/PHHmjBhglJSUmSz2bR48eJTbrNy5UoNGzZMDodD5557rl5++eUWrxMAALQfIQ035eXlGjJkiObMmdOo/jt37tT48eN1+eWXKy8vT/fff7/uuOMOLV++vIUrBQAA7UV4KJ983LhxGjduXKP7z507V71799ZTTz0lSerfv78+/fRTPfPMMxo7dmxLlQkAANqRdrXmJicnRxkZGUFtY8eOVU5OTogqAgAAbU1IZ26ayu12y+VyBbW5XC55vV4dOXJE0dHRJ2xTVVWlqqqqwG2v19vidQIAgNBpVzM3pyMrK0vx8fGBS2pqaqhLAgAALahdhZukpCQVFRUFtRUVFcnpdNY7ayNJ06dPl8fjCVwKCgpao1QAABAi7Wq3VHp6upYsWRLUtmLFCqWnp590G4fDIYfD0dKlAQCANiKkMzdlZWXKy8tTXl6epLpDvfPy8pSfny+pbtZl8uTJgf533323duzYoV/+8pfasmWL/vSnP+mNN97QAw88EIryAQBAGxTScLNmzRqlpaUpLS1NkpSZmam0tDTNnDlTklRYWBgIOpLUu3dvvf/++1qxYoWGDBmip556Sn/5y184DBwAAATYjDEm1EW0Jq/Xq/j4eHk8HjmdzlCXAwAAGqEpn9/takExAADAqRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApRBuAACApYQ83MyZM0e9evVSVFSURo8erdWrVzfY/9lnn9X555+v6Ohopaam6oEHHlBlZWUrVQsAANq6kIabhQsXKjMzU7NmzdLatWs1ZMgQjR07VsXFxfX2nz9/vh566CHNmjVLmzdv1l//+lctXLhQDz/8cCtXDgAA2qqQhpunn35ad955p6ZOnaoBAwZo7ty5iomJ0Ysvvlhv/88++0xjxozRzTffrF69eul73/uebrrpplPO9gAAgLNHyMJNdXW1cnNzlZGRcayYsDBlZGQoJyen3m0uuugi5ebmBsLMjh07tGTJEl199dUnfZ6qqip5vd6gCwAAsK7wUD1xSUmJfD6fXC5XULvL5dKWLVvq3ebmm29WSUmJLr74YhljVFtbq7vvvrvB3VJZWVmaPXt2s9YOAADarpAvKG6KlStX6oknntCf/vQnrV27Vm+//bbef/99/frXvz7pNtOnT5fH4wlcCgoKWrFiAADQ2kI2c9O5c2fZ7XYVFRUFtRcVFSkpKanebWbMmKFbb71Vd9xxhyRp0KBBKi8v11133aVHHnlEYWEnZjWHwyGHw9H8LwAAALRJIZu5iYyM1PDhw5WdnR1o8/v9ys7OVnp6er3bVFRUnBBg7Ha7JMkY03LFAgCAdiNkMzeSlJmZqSlTpmjEiBEaNWqUnn32WZWXl2vq1KmSpMmTJ6tbt27KysqSJE2YMEFPP/200tLSNHr0aG3fvl0zZszQhAkTAiEHAACc3UIabiZNmqT9+/dr5syZcrvdGjp0qJYtWxZYZJyfnx80U/Poo4/KZrPp0Ucf1d69e9WlSxdNmDBBjz/+eKheAgAAaGNs5izbn+P1ehUfHy+PxyOn0xnqcgAAQCM05fO7XR0tBQAAcCqEGwAAYCmEGwAAYCmNDjeJiYkqKSmRJP34xz9WaWlpixUFAABwuhodbqqrqwPfy/TKK6+osrKyxYoCAAA4XY0+FDw9PV0TJ07U8OHDZYzRvffeq+jo6Hr7nuxbvQEAAFpao8PNa6+9pmeeeUZff/21JMnj8TB7AwAA2pzTOs9N7969tWbNGnXq1KklampRnOcGAID2pymf36d1huLbbrtNc+bMqfc+m82mGTNmnM7DAgAAnLHTCjeLFy8Oul1TU6OdO3cqPDxc55xzDuEGAACEzGmFm3Xr1p3Q5vV6ddttt+naa68946IAAABOV7OdxM/pdGr27NnM2gAAgJBq1jMUezweeTye5nxIAACAJjmt3VJ/+MMfgm4bY1RYWKi//e1vGjduXLMUBgAAcDpOK9w888wzQbfDwsLUpUsXTZkyRdOnT2+WwgAAAE7HaYWbnTt3NncdAAAAzYJvBQcAAJZCuAEAAJZCuAEAAJZCuAEAAJZCuAEAAJZCuAEAAJZCuAEAAJZCuAEAAJZCuAEAAJZCuGkH5q/K15VP/0t5BYdDXQoAAG0e4aaN+3KfR7Pe3ahtxWX62Wu5OlxRHeqSAABo0wg3bVhljU+ZC9erxmckSfs8lfrvN9bLGBPiygAAaLsIN23YMyu+0taiUnWOjdSrPx6lSHuYsrcU6y+f8MWlAACcDOGmjVq144Be+GSHJCnrusG6tG8XzZgwQJL022VbtDb/UCjLAwCgzSLctEFlVbX67zfXyxjpxhHddeUAlyTpR6N7aPzgZNX6jX4+fx3rbwAAqAfhpg369T82ac+hI+reMVozvj8g0G6z2fTkdYPUs1OM9h4+ov95k/U3AAB8G+GmjVmxqUgL1xTIZpOeumGI4qIigu6Pi4rQnJuHKdIepn9uLtZfP2X9DQAAxyPctCEHyqo0/e3/SJLuvKSPRvfpVG+/gd3iNeP7/SVJTy5l/Q0AAMcLebiZM2eOevXqpaioKI0ePVqrV69usP/hw4c1bdo0JScny+FwqG/fvlqyZEkrVdtyjDF6eNEGlZRV63xXnDKv7Ntg/x9d2FPjB7H+BgCAbwtpuFm4cKEyMzM1a9YsrV27VkOGDNHYsWNVXFxcb//q6mpdeeWV2rVrl9566y1t3bpV8+bNU7du3Vq58ub39tq9Wv5lkSLsNj09aYiiIuwN9rfZbMq6/vj1N/9h/Q0AAApxuHn66ad15513aurUqRowYIDmzp2rmJgYvfjii/X2f/HFF3Xw4EEtXrxYY8aMUa9evXTZZZdpyJAhrVx589p7+Igee/dLSdL9GX11QUp8o7ZzBq2/KWL9DQAACmG4qa6uVm5urjIyMo4VExamjIwM5eTk1LvNu+++q/T0dE2bNk0ul0sDBw7UE088IZ/P11plNzu/3+h/3liv0qpaDeuRoJ9c2qdJ2w/sFq9Hj66/+e2yLXz/FADgrBeycFNSUiKfzyeXyxXU7nK55Ha7691mx44deuutt+Tz+bRkyRLNmDFDTz31lH7zm9+c9Hmqqqrk9XqDLm3JS5/tUs6OA4qOsOvpG4cq3N70H8mtF/bU1YOSVOMzmvb3tfJU1LRApQAAtA8hX1DcFH6/X127dtULL7yg4cOHa9KkSXrkkUc0d+7ck26TlZWl+Pj4wCU1NbUVK27YtqJS/XbZFknSI+P7q1fnDqf1ODabTU9eP1g9Eo+uv3mL898AAM5eIQs3nTt3lt1uV1FRUVB7UVGRkpKS6t0mOTlZffv2ld1+bLFt//795Xa7VV1d/9FC06dPl8fjCVwKCgqa70Wcgepavx54I0/VtX595/wuumV0jzN6vOPX36zYVKQX/72reQoFAKCdCVm4iYyM1PDhw5WdnR1o8/v9ys7OVnp6er3bjBkzRtu3b5ff7w+0ffXVV0pOTlZkZGS92zgcDjmdzqBLW/D8h9u0ca9XCTER+t/rB8tms53xYw7qHq9Hxn9z/pvNrL8BAJyVQrpbKjMzU/PmzdMrr7yizZs366c//anKy8s1depUSdLkyZM1ffr0QP+f/vSnOnjwoO677z599dVXev/99/XEE09o2rRpoXoJp2Vd/iHNWfm1JOk3EweqqzOq2R57cnpPjRtYt/7mnvmsvwEAnH3CQ/nkkyZN0v79+zVz5ky53W4NHTpUy5YtCywyzs/PV1jYsfyVmpqq5cuX64EHHtDgwYPVrVs33XfffXrwwQdD9RKa7Ei1T5lvrJfPb/SDoSn6/uCUZn18m82m3/5wsL7c51X+wQr94q31+r9bhzfLzBAAAO2BzZxlK0+9Xq/i4+Pl8XhCsotq5jsb9WrObiU5o7T8/ksVHxNx6o1Ow4Y9Hl3/589U7fNr5vcH6McX926R5wEAoDU05fO7XR0t1d59/NV+vZqzW5L0uxsGt1iwkYLX32Qt3az1rL8BAJwlCDetxFNRo1+8tV6SNCW9py45r0uLP+fk9J666oKj57+Zv1aeI6y/AQBYH+Gmlcx4Z6OKvFXq07mDHhrXv1We85v1N6mJ0dpz6Ih+yflvgCYpq6rVl/s8WrqhUH/5ZIc27PGEuiQAjRDSBcVni3+s36d31++TPcympycNVXRkw1+K2Zzio+vOf3P9nz/T8i+L9PJnuzR1DOtvAEkyxuhgebV2HahQ/sFy7T5QcfRSrvyDFSopCz5/lj3Mphnj+2vKRb1YpA+0YYSbFlbkrdSjizdKkqZdfq6Gpia0eg2Duyfokav767F/bNITSzZreM+OGty99esAQsHvNyr0Vmp3Sbl2H6wLL/kHy7WrpEL5BytUVlXb4PaJHSLVIzFGkfYwrd51UI/9Y5O2FpVq9jUDFRnO5DfQFhFuWpAxRr946z/yHKnRoG7x+vl3zw1ZLVMu6qXPdxzUsi/dmjZ/rd77+SWKj265Bc3tlTGGv8jbsa+KSvXZ9pKjMzEV2nWgXHsOHlG1z3/SbWw2KdkZpR6dYtSrUwf16BSjnokd1LNTjHp0ipEzqu73xBijv3yyU1lLN+v11QX6urhcf/7RMHWKdbTWywPQSBwK3oJe+3y3Hl28UY7wML1/78U6t2tciz7fqXiO1Gj8Hz7RnkNHdNUFSfrzj4bxQS7pQFmVPthUpKUb3fr86wOKsNvkjI5QfHSEnFERx/4dHR5oq7t9YntMpJ0xbWX7S6v07vp9envtHn25r/4vxo2w25TaMeZocIlRz0514aVnpxh17xijqIjG7yr+aGux7p2/TqVVteqWEK15k0doQErbOPM5YGVN+fwm3LSQnSXluvq5T3SkxtemzjOzvuCwfjj3M9X4jB6bMEC3naXrb4q9lVr+pVtLNri1aucB+ZvptyA87PhgFC7ncSEoMSZSrvgoJTmjlBwfJZczSp06RCosjDDUVJU1Pq3YVKS31+7Rx9tK5Dv6A4yw2zTm3M463xUXCDA9EmOUkhAtezOO8/biUt3xyhrtOlChmEi7nr5xqK4aWP934gEN8fuNSqtq5T1SI8+RmmPXlXXXFdU+jeqdqPQ+nc76P5wINw1ojXBT6/Prhv/L0br8w7ronE567fbRbeoD7KV/79Tsf2xSpD1Mb/00/axZf7P38BEt2+jWso2FWrP7kI7/nz+wm1PjBiYro79LjvCwoDcX75HaoNvfvAF5j9TIW1kbuF17Ggkpwm5T17goJR0NPd9cu+LrAlCSM0pdnQ45wltuEbrPb1RWVauyqlqVV9WqtLLu32WVtYqw25TWo6O6xIV+14vfb/TFroN6e+1eLdlQqNLj1soMTU3Q9cO66fuDU9SxQ/3fM9fcPBU1mjZ/rT7dXiJJyryyr37+3XPP+g+gs5Hfb1RSXhX0XnFCWDl6X/B7S41Kq2rVmE/h87rG6tb0nro2rZvios7OJQWEmwa0RriZ89F2/W75VsU5wrXsgUvVLSG6RZ7ndBljdPdruVr+ZZFSE6OD1t8YY1RV61dFtU/lVbV119W1qqg6el1dq/IqX/B1tU8VVUevj2uv9Rn16RKrASlODUh26oIUp7p3jG7VN//dB8q1dKNbSze6TziRYVqPBI0bmKRxA5OVmhhzRs9jjFFFte/EQHTcm1lJWZXcnioVeSvl9laqpKyqUW9qUt2i1m/Cj8t5LPi44qPUNc4hn98cCyVVNSqr8qms8ui/K2tVejSsBALMcbcrqn2nfP5enWI0vGeiRvbqqBG9OuqcLrGt9nPcWVKuRWv36O11e7Xn0JFAe7eEaF03rJuuTeumPl1iW6WWb6v1+fWb9zfr5c92SZLGD0rW724YrJhIljOeDXYfKNcbawr05po9Ki6tOqPHiooIU/xxu8O/2fVtjNEHm4oCv6cdIu26blh33ZreU31doV3q0NoINw1o6XCzca9HE+f8W7V+o6dvHKLrhnVv9udoDp6KGo3/Y936m86xDoWH2Y6GF19gir8lxDnC1T/ZGQg8A1KcOs8V26wzE9uLy7RsY6GWbHBrU+GxNRg2mzSyZ6LGDUrS2AuSlBLi0Fnj86u4tEpuT2Vd4PHUhZ5vX1fXnnwxbHOKsNsUFxWhWEe4OjjCFecIl+dIjb4qLj0hhHWMidDwnh0DgWdQ9/hm/RkerqjWP/5TqLfX7tG6/MOB9lhHuMYPSta1w7ppVK/ENjMjumB1vma8s1E1PqMLUpx6YfKINvdHDZpHZY1Pyza6tfCLAuXsOBBot9l0dI1e8Nq8oPV5R3dVB7fVbdPQ74+3skZv5+7R3z7fra/3lwfaR/dO1OT0XvreBS5F2K1/5B7hpgEtGW4qa3ya8MdPta24rF0s2M0rOKwb5+ac9EiSqIgwdYgMV4zDXncdaVcHx9Hro+0xkcG3j+8n1R29smmfV5sKvdpWVFbvc4WH2XRu19i60HM08PRPdiqxkbsXjDHaWlSqJRvqdjl9VVQWuM8eZtOFfRJ11cBkjb3Apa5xzfcN7K3BGKPDFTUnhJ4ib6UKj14Xl1Yp0h6mDg67YqMiFOcIV6wjXLFRR6+P+3fccW0djr8ddfI3V8+RGq3NP6TcXYf0xa6DWr/nsCprgn+OkfYwDe4er+G9Ompkz0QN79mxybuHqmv9+mhrsRat3avsLUWq8dW9NdnDbLrkvM66blh3Xdnf1arniWqKL3Yd1N1/y9WB8mp1jo3U/906XMN7Joa6LDSTTfu8WvhFvhat2ytvZd0uUZtNuvS8Lpo0MlUZ/V0tfmoAY4xyvj6gV3N2a8XmosAfoi6nQzeN6qGbR/VQV2f7eo9rCsJNA1oy3Dz+/ibN+2SnOsc69MEDlzb6wzmUdh8oV5G3KhBIOkTaFR1ZF1qacwGmVDdT8fX+srqwczTwbCr06nBF/V8LkRwfpQHJzqCZnh6JMQoLs8kYo417vVq6sVBLN7q1s+TYXzPfLCodNzBJVw5Iahc/h/akutavL/d5lLv7kNbsOqQ1uw+ecLI7STq3a6xG9Oyo4T07amSvRPXsFHNC2DfGKK/gsBat26t/rN+nQ8f9XxiQ7NR1w7rpmqEp7SaU7jlUoTtfzdXmQq8i7WF6/NqBumFEaqjLwmnyVtbo3bx9WvhFgTbsPXZ26m4J0bphRHfdMCI1ZDN0hZ4jmr8qX6+vLlBJWd0usfAwm8YOTNLkC3tqVO/EkP1xXVJWpfKqWvXs1KFZH5dw04CWCjef7zigm+Z9LmOkv04ZoSv6u5rtsa3MGKNCT6U2FwYHnt0HKurt3yHSrn7JThV5K4PWX0SGh+nS87ro6kFJuqK/i3P4tCJjjHYfqNAXuw7WBZ7dh7S9uOyEfp1jHRrRs27NzsBu8VpzdHHwjuOCadc4h65N66Zrh3VTv6T2eXh1eVWt/vuN9Vr2pVuSdPvFvTV9XD+FnwW7DazAGKMvdh3Sgi/ytWRDYWCWMsJu0/cGJGnSyFSNObdzs//xd7qqa/1a9qVbf8vZpS92HQq0n++K04+OLkCOdbTMGrDKGp+2FZVps9urre5SbXWXaovbq5Kyan3n/C56eeqoZn0+wk0DWircbHF7df+CPA1NTdCT1w9utsc9W5VW1miLuzQo9GxxlwatP4mOsOvyfl101cBkfbdf1xb7BUbTHSqvVu7uQ/pi90Hl7jqk/+zxnHT3Z3SEXWMvcOm6Yd3b1IfGmfD7jZ7L3qbnsrdJki7t20V/vCmN0N2G7S+t0v9bu0dvfFEQFLjP6xqrSSNTdW1atzZ/wsZN+7z62+e7tXjdXh2pqVuAHOsI1/XDuunW9J6nfa41v9+o4FCFtrhLtaWwVFuL6t6Pd5WU13sajW/WN75xd/qZvJwTEG4a0JK7papq6xbjcqREy6j1+bWjpFybC72KjrDrkvO6tNn1FwhWWePTxr0erdl9SGt2HdTGvV716dJB1w3rrqsGJlk2mC7ZUKj/fmO9jtT41KdzB82bMkLnhOjIruZiTN15WUpKq1RV61d1rV81vrrr6uOuj7WZuuvj+tX4/HXb+vyq+Vb/qlq/oiPsdeeCCpwSIbruOj6qSSdcPJVan18fb9uvBasL9OGW4sDpHGIi7ZowOEWTRqUqLTWhTa+drI/nSI3+X+4evfb57qCgdtE5nTQ5vacy+rtOOpN4qLxaW9yl2uquCzBb3KX6qqj0pEdVJnaI1PmuOPVLjlO/pDidn+RUX1dsi3wOEm4a0JpnKAaAL/d5dOcra7TPU6m4qHA9f/MwXda3S6jLqldljU/F3iq5vXWL1Y9dqoL+/c2sQCgkxEQEToSZdFzwCQSh+CjFOcIbDCT5Byr0xpoCvZW7R25vZaA9rUeC/mtkqsYPTrFE4Pb7jT77+oBezdmlf24uCsyyJDmjdPPoHrrkvM7aWVKure5SbT4aaIq89R/SHhkepvO6xur8pDj1T3Lq/KS6QNMl1tFq4Y9w0wDCDYDWtr+0Sne/lqvc3YcUZpMevrq/br+4d6t9KNT6/Copqw6cY6n4aEj5JsR8E2g8R+pf3F+fOEe4oiLtirSHKTI8TBF229HrsEDbsfuOXTu+3Tc8uH+EPUwV1bVyHz0i0H30UuipbHSo6hBpD5r1SYp3KCk+WpF2m95dv0//3n7sEO6OMRG6blh3TRqZaunzxuw9fETzV+3WgtUFOlB+4gEAx0tNjNb5Lqf6JR2bkenVqUPI140RbhpAuAEQClW1Ps1YvFFvrNkjSfrh8O56/NqBzXJ+oMoan/YePqI9h46o4GCFCg5VaM/BI9pzqEKFnroTRjb29FWO8LDAySJdzii54hxKio9SV+fRE0c6HeoaF9Xqu4SNMfJW1h53SoQjx8KP91gAakxAs9mki8/trEkjU3XlAFeLngG8ramq9WnpBrde+3y3dpaU65yusUd3J8Wp39EZmbY6a0W4aQDhBkCoGGP00r936Tfvb5LfSMN7dtTcHw0/5ddb1Pr8KvRUHgsugRBTF2BOtivhePYwm7rGOdT1uMASCDBOR+DfzqiGd+m0dUeqfUdnfY6cGHwqanThOZ10w/DuZ3xWcrQ+wk0DCDcAQu3jr/brnvlr5a2sVXJ8lF64dYQ6x0UeCy1HZ10KDtX92+2tPOWZw2Mi7UrtGKPUxGh17xij7h2jlZoYo5T4aLniHerUwWGJI9Fw9iLcNIBwA6At2LG/THe8ukY7jjudfkMi7WHq3jFa3RNjlNqxLsCkJkYr9WiQSewQ2a5nXIBTacrnd9vcsQYAFtenS6wW/WyMHliYpw+3FMseZlNKQpS6JxwXWo5epybGqEuso818lxbQ1hFuACBE4qMj9OJtI3WovFpxUeEhPxoFsArCDQCEWFO/ZBRAw/gzAQAAWArhBgAAWArhBgAAWArhBgAAWArhBgAAWArhBgAAWArhBgAAWArhBgAAWArhBgAAWArhBgAAWEqbCDdz5sxRr169FBUVpdGjR2v16tWN2m7BggWy2WyaOHFiyxYIAADajZCHm4ULFyozM1OzZs3S2rVrNWTIEI0dO1bFxcUNbrdr1y79z//8jy655JJWqhQAALQHIQ83Tz/9tO68805NnTpVAwYM0Ny5cxUTE6MXX3zxpNv4fD7dcsstmj17tvr06dOK1QIAgLYupOGmurpaubm5ysjICLSFhYUpIyNDOTk5J93uV7/6lbp27arbb7/9lM9RVVUlr9cbdAEAANYV0nBTUlIin88nl8sV1O5yueR2u+vd5tNPP9Vf//pXzZs3r1HPkZWVpfj4+MAlNTX1jOsGAABtV8h3SzVFaWmpbr31Vs2bN0+dO3du1DbTp0+Xx+MJXAoKClq4SgAAEErhoXzyzp07y263q6ioKKi9qKhISUlJJ/T/+uuvtWvXLk2YMCHQ5vf7JUnh4eHaunWrzjnnnKBtHA6HHA5HC1QPAADaopDO3ERGRmr48OHKzs4OtPn9fmVnZys9Pf2E/v369dOGDRuUl5cXuFxzzTW6/PLLlZeXxy4nAAAQ2pkbScrMzNSUKVM0YsQIjRo1Ss8++6zKy8s1depUSdLkyZPVrVs3ZWVlKSoqSgMHDgzaPiEhQZJOaAcAAGenkIebSZMmaf/+/Zo5c6bcbreGDh2qZcuWBRYZ5+fnKyysXS0NAgAAIWQzxphQF9GavF6v4uPj5fF45HQ6Q10OAABohKZ8fjMlAgAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALIVwAwAALKVNhJs5c+aoV69eioqK0ujRo7V69eqT9p03b54uueQSdezYUR07dlRGRkaD/QEAwNkl5OFm4cKFyszM1KxZs7R27VoNGTJEY8eOVXFxcb39V65cqZtuukkfffSRcnJylJqaqu9973vau3dvK1cOAADaIpsxxoSygNGjR2vkyJF6/vnnJUl+v1+pqan6+c9/roceeuiU2/t8PnXs2FHPP/+8Jk+efMr+Xq9X8fHx8ng8cjqdZ1w/AABoeU35/A7pzE11dbVyc3OVkZERaAsLC1NGRoZycnIa9RgVFRWqqalRYmJivfdXVVXJ6/UGXQAAgHWFNNyUlJTI5/PJ5XIFtbtcLrnd7kY9xoMPPqiUlJSggHS8rKwsxcfHBy6pqalnXDcAAGi7Qr7m5kw8+eSTWrBggRYtWqSoqKh6+0yfPl0ejydwKSgoaOUqAQBAawoP5ZN37txZdrtdRUVFQe1FRUVKSkpqcNvf//73evLJJ/XPf/5TgwcPPmk/h8Mhh8PRLPUCAIC2L6QzN5GRkRo+fLiys7MDbX6/X9nZ2UpPTz/pdv/7v/+rX//611q2bJlGjBjRGqUCAIB2IqQzN5KUmZmpKVOmaMSIERo1apSeffZZlZeXa+rUqZKkyZMnq1u3bsrKypIk/fa3v9XMmTM1f/589erVK7A2JzY2VrGxsSF7HQAAoG0IebiZNGmS9u/fr5kzZ8rtdmvo0KFatmxZYJFxfn6+wsKOTTD9+c9/VnV1tX74wx8GPc6sWbP02GOPtWbpAACgDQr5eW5aG+e5AQCg/Wk357kBAABoboQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKW0i3MyZM0e9evVSVFSURo8erdWrVzfY/80331S/fv0UFRWlQYMGacmSJa1UKQAAaOtCHm4WLlyozMxMzZo1S2vXrtWQIUM0duxYFRcX19v/s88+00033aTbb79d69at08SJEzVx4kRt3LixlSsHAABtkc0YY0JZwOjRozVy5Eg9//zzkiS/36/U1FT9/Oc/10MPPXRC/0mTJqm8vFzvvfdeoO3CCy/U0KFDNXfu3FM+n9frVXx8vDwej5xOZ/O9EAAA0GKa8vkd3ko11au6ulq5ubmaPn16oC0sLEwZGRnKycmpd5ucnBxlZmYGtY0dO1aLFy9uyVJPzRippiK0NQAA0FZExEg2W0ieOqThpqSkRD6fTy6XK6jd5XJpy5Yt9W7jdrvr7e92u+vtX1VVpaqqqsBtr9d7hlWfRE2F9ERKyzw2AADtzcP7pMgOIXnqkK+5aWlZWVmKj48PXFJTU0NdEgAAaEEhnbnp3Lmz7Ha7ioqKgtqLioqUlJRU7zZJSUlN6j99+vSg3Vher7dlAk5ETF1KBQAAdZ+LIRLScBMZGanhw4crOztbEydOlFS3oDg7O1v33HNPvdukp6crOztb999/f6BtxYoVSk9Pr7e/w+GQw+Fo7tJPZLOFbPoNAAAcE9JwI0mZmZmaMmWKRowYoVGjRunZZ59VeXm5pk6dKkmaPHmyunXrpqysLEnSfffdp8suu0xPPfWUxo8frwULFmjNmjV64YUXQvkyAABAGxHycDNp0iTt379fM2fOlNvt1tChQ7Vs2bLAouH8/HyFhR1bGnTRRRdp/vz5evTRR/Xwww/rvPPO0+LFizVw4MBQvQQAANCGhPw8N62N89wAAND+NOXz2/JHSwEAgLML4QYAAFgK4QYAAFgK4QYAAFgK4QYAAFgK4QYAAFgK4QYAAFgK4QYAAFgK4QYAAFgK4QYAAFhKyL9bqrV9820TXq83xJUAAIDG+uZzuzHfGnXWhZvS0lJJUmpqaogrAQAATVVaWqr4+PgG+5x1X5zp9/u1b98+xcXFyWazNetje71epaamqqCggC/lbEGMc+tgnFsH49x6GOvW0VLjbIxRaWmpUlJSFBbW8Kqas27mJiwsTN27d2/R53A6nfzitALGuXUwzq2DcW49jHXraIlxPtWMzTdYUAwAACyFcAMAACyFcNOMHA6HZs2aJYfDEepSLI1xbh2Mc+tgnFsPY9062sI4n3ULigEAgLUxcwMAACyFcAMAACyFcAMAACyFcNNM5syZo169eikqKkqjR4/W6tWrQ11Su5aVlaWRI0cqLi5OXbt21cSJE7V169agPpWVlZo2bZo6deqk2NhYXX/99SoqKgpRxdbw5JNPymaz6f777w+0Mc7NZ+/evfrRj36kTp06KTo6WoMGDdKaNWsC9xtjNHPmTCUnJys6OloZGRnatm1bCCtuf3w+n2bMmKHevXsrOjpa55xzjn79618HnbKfcW66jz/+WBMmTFBKSopsNpsWL14cdH9jxvTgwYO65ZZb5HQ6lZCQoNtvv11lZWUtU7DBGVuwYIGJjIw0L774ovnyyy/NnXfeaRISEkxRUVGoS2u3xo4da1566SWzceNGk5eXZ66++mrTo0cPU1ZWFuhz9913m9TUVJOdnW3WrFljLrzwQnPRRReFsOr2bfXq1aZXr15m8ODB5r777gu0M87N4+DBg6Znz57mtttuM6tWrTI7duwwy5cvN9u3bw/0efLJJ018fLxZvHixWb9+vbnmmmtM7969zZEjR0JYefvy+OOPm06dOpn33nvP7Ny507z55psmNjbWPPfcc4E+jHPTLVmyxDzyyCPm7bffNpLMokWLgu5vzJheddVVZsiQIebzzz83n3zyiTn33HPNTTfd1CL1Em6awahRo8y0adMCt30+n0lJSTFZWVkhrMpaiouLjSTzr3/9yxhjzOHDh01ERIR58803A302b95sJJmcnJxQldlulZaWmvPOO8+sWLHCXHbZZYFwwzg3nwcffNBcfPHFJ73f7/ebpKQk87vf/S7QdvjwYeNwOMzrr7/eGiVawvjx482Pf/zjoLbrrrvO3HLLLcYYxrk5fDvcNGZMN23aZCSZL774ItBn6dKlxmazmb179zZ7jeyWOkPV1dXKzc1VRkZGoC0sLEwZGRnKyckJYWXW4vF4JEmJiYmSpNzcXNXU1ASNe79+/dSjRw/G/TRMmzZN48ePDxpPiXFuTu+++65GjBihG264QV27dlVaWprmzZsXuH/nzp1yu91BYx0fH6/Ro0cz1k1w0UUXKTs7W1999ZUkaf369fr00081btw4SYxzS2jMmObk5CghIUEjRowI9MnIyFBYWJhWrVrV7DWddd8t1dxKSkrk8/nkcrmC2l0ul7Zs2RKiqqzF7/fr/vvv15gxYzRw4EBJktvtVmRkpBISEoL6ulwuud3uEFTZfi1YsEBr167VF198ccJ9jHPz2bFjh/785z8rMzNTDz/8sL744gvde++9ioyM1JQpUwLjWd97CWPdeA899JC8Xq/69esnu90un8+nxx9/XLfccoskMc4toDFj6na71bVr16D7w8PDlZiY2CLjTrhBmzdt2jRt3LhRn376aahLsZyCggLdd999WrFihaKiokJdjqX5/X6NGDFCTzzxhCQpLS1NGzdu1Ny5czVlypQQV2cdb7zxhv7+979r/vz5uuCCC5SXl6f7779fKSkpjPNZhN1SZ6hz586y2+0nHD1SVFSkpKSkEFVlHffcc4/ee+89ffTRR0Hf5p6UlKTq6modPnw4qD/j3jS5ubkqLi7WsGHDFB4ervDwcP3rX//SH/7wB4WHh8vlcjHOzSQ5OVkDBgwIauvfv7/y8/MlKTCevJecmV/84hd66KGH9F//9V8aNGiQbr31Vj3wwAPKysqSxDi3hMaMaVJSkoqLi4Pur62t1cGDB1tk3Ak3ZygyMlLDhw9XdnZ2oM3v9ys7O1vp6ekhrKx9M8bonnvu0aJFi/Thhx+qd+/eQfcPHz5cERERQeO+detW5efnM+5NcMUVV2jDhg3Ky8sLXEaMGKFbbrkl8G/GuXmMGTPmhNMZfPXVV+rZs6ckqXfv3kpKSgoaa6/Xq1WrVjHWTVBRUaGwsOCPNrvdLr/fL4lxbgmNGdP09HQdPnxYubm5gT4ffvih/H6/Ro8e3fxFNfsS5bPQggULjMPhMC+//LLZtGmTueuuu0xCQoJxu92hLq3d+ulPf2ri4+PNypUrTWFhYeBSUVER6HP33XebHj16mA8//NCsWbPGpKenm/T09BBWbQ3HHy1lDOPcXFavXm3Cw8PN448/brZt22b+/ve/m5iYGPPaa68F+jz55JMmISHBvPPOO+Y///mP+cEPfsAhyk00ZcoU061bt8Ch4G+//bbp3Lmz+eUvfxnowzg3XWlpqVm3bp1Zt26dkWSefvpps27dOrN7925jTOPG9KqrrjJpaWlm1apV5tNPPzXnnXceh4K3dX/84x9Njx49TGRkpBk1apT5/PPPQ11Suyap3stLL70U6HPkyBHzs5/9zHTs2NHExMSYa6+91hQWFoauaIv4drhhnJvPP/7xDzNw4EDjcDhMv379zAsvvBB0v9/vNzNmzDAul8s4HA5zxRVXmK1bt4ao2vbJ6/Wa++67z/To0cNERUWZPn36mEceecRUVVUF+jDOTffRRx/V+548ZcoUY0zjxvTAgQPmpptuMrGxscbpdJqpU6ea0tLSFqmXbwUHAACWwpobAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAABgKYQbAG3GY489pqFDh4bs+W02mxYvXixJ2rVrl2w2m/Ly8kJWD4DTwxmKAbQZZWVlqqqqUqdOnULy/DabTYsWLdLEiRPl8/m0f/9+de7cWeHh4SGpB8Dp4TcWQJsRGxur2NjYUJchqe6bpJOSkkJdBoDTwG4pAM3mO9/5ju6991798pe/VGJiopKSkvTYY48F7s/Pz9cPfvADxcbGyul06sYbb1RRUVHg/m/vllq5cqVGjRqlDh06KCEhQWPGjNHu3bsD97/zzjsaNmyYoqKi1KdPH82ePVu1tbWNqnXbtm269NJLFRUVpQEDBmjFihVB9397t9TKlStls9m0fPlypaWlKTo6Wt/97ndVXFyspUuXqn///nI6nbr55ptVUVHR9MED0GyYuQHQrF555RVlZmZq1apVysnJ0W233aYxY8boiiuuCASbf/3rX6qtrdW0adM0adIkrVy58oTHqa2t1cSJE3XnnXfq9ddfV3V1tVavXi2bzSZJ+uSTTzR58mT94Q9/0CWXXKKvv/5ad911lyRp1qxZDdbo9/t13XXXyeVyadWqVfJ4PLr//vsb9foee+wxPf/884qJidGNN96oG2+8UQ6HQ/Pnz1dZWZmuvfZa/fGPf9SDDz7YpHED0Ixa5LvGAZyVLrvsMnPxxRcHtY0cOdI8+OCD5oMPPjB2u93k5+cH7vvyyy+NJLN69WpjjDGzZs0yQ4YMMcYYc+DAASPJrFy5st7nuuKKK8wTTzwR1Pa3v/3NJCcnn7LO5cuXm/DwcLN3795A29KlS40ks2jRImOMMTt37jSSzLp164wxxnz00UdGkvnnP/8Z2CYrK8tIMl9//XWg7Sc/+YkZO3bsKWsA0HLYLQWgWQ0ePDjodnJysoqLi7V582alpqYqNTU1cN+AAQOUkJCgzZs3n/A4iYmJuu222zR27FhNmDBBzz33nAoLCwP3r1+/Xr/61a8C63RiY2N15513qrCw8JS7hb6pJSUlJdCWnp7e5NfncrkUExOjPn36BLUVFxc36rEAtAzCDYBmFREREXTbZrPJ7/ef1mO99NJLysnJ0UUXXaSFCxeqb9+++vzzzyXVHVk1e/Zs5eXlBS4bNmzQtm3bFBUVdcav42SOf302m61ZXy+A5sGaGwCton///iooKFBBQUFg9mbTpk06fPiwBgwYcNLt0tLSlJaWpunTpys9PV3z58/XhRdeqGHDhmnr1q0699xzT7uWwsJCJScnS1IgNAFo/wg3AFpFRkaGBg0apFtuuUXPPvusamtr9bOf/UyXXXaZRowYcUL/nTt36oUXXtA111yjlJQUbd26Vdu2bdPkyZMlSTNnztT3v/999ejRQz/84Q8VFham9evXa+PGjfrNb35zylr69u2rKVOm6He/+528Xq8eeeSRFnndAFofu6UAtAqbzaZ33nlHHTt21KWXXqqMjAz16dNHCxcurLd/TEyMtmzZouuvv159+/bVXXfdpWnTpuknP/mJJGns2LF677339MEHH2jkyJG68MIL9cwzz6hnz56nrCUsLEyLFi3SkSNHNGrUKN1xxx16/PHHm/X1AggdzlAMAAAshZkbAABgKYQbAJbz97//PegQ8eMvF1xwQajLA9DC2C0FwHJKS0uDvtbheBEREY1alwOg/SLcAAAAS2G3FAAAsBTCDQAAsBTCDQAAsBTCDQAAsBTCDQAAsBTCDQAAsBTCDQAAsBTCDQAAsJT/D6NtcABOUtH5AAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots()\n", + "sns.lineplot(data=result_df, x=\"noise_dim\", y=\"uf\", ax=ax)\n", + "sns.lineplot(data=result_df, x=\"noise_dim\", y=\"true\", ax=ax)\n", + "sns.lineplot(data=result_df, x=\"noise_dim\", y=\"ksg\", ax=ax)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02d99c21-6874-48e8-8a51-ed923646f63a", + "metadata": {}, "outputs": [], "source": [] } diff --git a/sktree/experimental/__init__.py b/sktree/experimental/__init__.py index 0b4eb9304..317305dac 100644 --- a/sktree/experimental/__init__.py +++ b/sktree/experimental/__init__.py @@ -1,4 +1,6 @@ from . import mutual_info, simulate +from .forest import SupervisedInfoForest +from .ksg import entropy_continuous, mutual_info_ksg from .mutual_info import ( cmi_from_entropy, cmi_gaussian, @@ -8,4 +10,3 @@ mi_gamma, mi_gaussian, ) -from .ksg import mutual_info_ksg \ No newline at end of file diff --git a/sktree/experimental/forest.py b/sktree/experimental/forest.py new file mode 100644 index 000000000..ff51e694d --- /dev/null +++ b/sktree/experimental/forest.py @@ -0,0 +1,67 @@ +from copy import copy + +import numpy as np +from scipy.stats import entropy +from sklearn.base import BaseEstimator, MetaEstimatorMixin + +from .ksg import entropy_continuous + + +class SupervisedInfoForest(BaseEstimator, MetaEstimatorMixin): + def __init__(self, estimator, y_categorical: bool = False, n_jobs=None): + self.estimator = estimator + self.y_categorical = y_categorical + self.n_jobs = n_jobs + + def fit(self, X, y, Z=None): + X, y = self._validate_data(X, y, accept_sparse="csc") + + print(X.shape, y.shape) + self.estimator_yxz_ = copy(self.estimator) + self.estimator_yz_ = copy(self.estimator) + + if Z is not None: + XZ = np.hstack((X, Z)) + else: + XZ = X + + # compute the entropy of H(Y | X, Z) + self.estimator_yxz_.fit(XZ, y) + + # compute the entropy of H(Y | Z) + if Z is None: + # compute entropy using empirical distribution + if self.y_categorical: + _, self.counts_ = np.unique(y, return_counts=True) + else: + self.H_yz_ = entropy_continuous(y, k=5, n_jobs=self.n_jobs) + else: + self.estimator_yz_.fit(Z, y) + + return self + + def predict_cmi(self, X, Z=None): + if Z is not None: + X, Z = self._validate_data(X, Z, accept_sparse="csc") + else: + X = self._validate_data(X, accept_sparse="csc") + + if Z is not None: + XZ = np.hstack((X, Z)) + else: + XZ = X + + # compute the entropy of H(Y | X, Z) + H_yxz = self.estimator_yxz_.predict_proba(XZ) + + # compute the entropy of H(Y | Z) + if Z is None: + if self.y_categorical: + # compute entropy using empirical distribution + H_yz = entropy(self.counts_, base=np.exp(1)) + else: + H_yz = self.H_yz_ + else: + H_yz = self.estimator_yz_.predict_proba(Z) + + return H_yxz - H_yz diff --git a/sktree/experimental/ksg.py b/sktree/experimental/ksg.py index 8b19c39a4..2ab35d9cf 100644 --- a/sktree/experimental/ksg.py +++ b/sktree/experimental/ksg.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Tuple import numpy as np import scipy.linalg @@ -10,16 +10,138 @@ from sklearn.preprocessing import StandardScaler +def entropy_continuous(X, k=1, norm="max", min_dist=0.0, n_jobs=-1): + """Estimates the entropy H of a continuous random variable. + + The approach uses the kth-nearest neighbour distances between sample points + from :footcite:`kozachenko1987sample`. See: https://github.com/paulbrodersen/entropy_estimators/blob/master/ + + Parameters + ---------- + X : ArrayLike of shape (n_samples, n_features) + N samples from a d-dimensional multivariate distribution + + k : int (default 1) + kth nearest neighbour to use in density estimate; + imposes smoothness on the underlying probability distribution + + norm : 'euclidean' or 'max' + p-norm used when computing k-nearest neighbour distances + + min_dist : float (default 0.) + minimum distance between data points; + smaller distances will be capped using this value + + n_jobs : int (default 1) + number of workers to use for parallel processing in query; + -1 uses all CPU threads + + Returns + ------- + h: float + entropy H(X) + + References + ---------- + .. footbibliography:: + """ + if X.ndim == 1: + X = X[:, np.newaxis] + n, d = X.shape + + if norm == "max": # max norm: + p = np.inf + log_c_d = 0 # volume of the d-dimensional unit ball + elif norm == "euclidean": # euclidean norm + p = 2 + log_c_d = (d / 2.0) * np.log(np.pi) - np.log(scipy.special.gamma(d / 2.0 + 1)) + else: + raise NotImplementedError("Variable 'norm' either 'max' or 'euclidean'") + + kdtree = scipy.spatial.KDTree(X) + + # query all points -- k+1 as query point also in initial set + # distances, _ = kdtree.query(x, k + 1, eps=0, p=norm) + distances, _ = kdtree.query(X, k + 1, eps=0, p=p, workers=n_jobs) + distances = distances[:, -1] + + # enforce non-zero distances + distances[distances < min_dist] = min_dist + + sum_log_dist = np.sum(np.log(2.0 * distances)) # where did the 2 come from? radius -> diameter + h = ( + -scipy.special.digamma(k) + + scipy.special.digamma(n) + + log_c_d + + (d / float(n)) * sum_log_dist + ) + + return h + + +def mutual_info_ksg_nn( + X, + Y, + Z=None, + k: float = 0.2, + norm='max', + n_jobs: int = -1, + transform: str = "rank", + random_seed: int = None, + verbose: bool = False, +): + rng = np.random.default_rng(random_seed) + + data = np.hstack((X, Y)) + x_idx = np.arange(X.shape[1]) + y_idx = np.arange(Y.shape[1]) + X.shape[1] + z_idx = np.array([]) + if Z is not None: + z_idx = np.arange(Z.shape[1]) + data.shape[1] + data = np.hstack((data, Z)) + + data = _preprocess_data(data, transform, rng) + if verbose: + print(f"Data shape: {data.shape}") + print(f"X shape: {X.shape}, Y shape: {Y.shape}") + if Z is not None: + print(f"Z shape: {Z.shape}") + print("Preprocessing complete.") + n_samples = data.shape[0] + + # get the number of neighbors to use in estimating the CMI + if k < 1: + knn_here = max(1, int(k * n_samples)) + else: + knn_here = max(1, int(k)) + + if verbose: + print(f"Using {knn_here} neighbors to define D-dimensional volume.") + + if Z is not None: + val = _cmi_ksg_scipy(data, x_idx, y_idx, z_idx, knn_here, n_jobs=n_jobs) + else: + val = _mi_ksg_scipy(data, x_idx, y_idx, knn_here, n_jobs=n_jobs) + + if norm == 'max': + norm_constant = np.log(1) + else: + norm_constant = np.log(np.pi ** (data.shape[1] / 2) / scipy.special.gamma(data.shape[1] / 2 + 1) / 2 ** data.shape[1]) + + return val + norm_constant + + def mutual_info_ksg( X, Y, Z=None, k: float = 0.2, + norm='max', nn_estimator=None, n_jobs: int = -1, transform: str = "rank", random_seed: int = None, - verbose: bool=False + verbose: bool = False, ): """Compute the generalized (conditional) mutual information KSG estimate. @@ -36,6 +158,12 @@ def mutual_info_ksg( The number of neighbors to use in defining the radius, by default 0.2. If a number less than 1, then the number of neighbors is computed as ``k * n_samples``. + norm : str, optional, {'max', 'euclidean'} + The norm to use in computing the distance, by default 'max'. This is the norm + used for computing the distance between points in the covariate space and + affects the constant term in the KSG estimator. For 'max' norm, the constant + term is :math:`\\log(1) = 0`, and for 'euclidean' norm, the constant term is + :math:`\\pi^{d/2} / \\Gamma(d/2 + 1) / 2^d)`, where :math:`d` is the dimension. nn_estimator : str, optional The nearest neighbor estimator to use, by default None. If None willl default to using :class:`sklearn.neighbors.NearestNeighbors` with default parameters. @@ -84,10 +212,10 @@ def mutual_info_ksg( ball with a specified radius. The larger the k, the higher the bias in the estimate, but the lower the variance. The smaller the k, the lower the bias, but the higher the variance. The default value of 0.2 is set - to allow scaling with the number of samples. + to allow scaling with the number of samples. """ rng = np.random.default_rng(random_seed) - + if nn_estimator is None: nn_estimator = NearestNeighbors(n_jobs=n_jobs) @@ -105,7 +233,7 @@ def mutual_info_ksg( print(f"X shape: {X.shape}, Y shape: {Y.shape}") if Z is not None: print(f"Z shape: {Z.shape}") - print('Preprocessing complete.') + print("Preprocessing complete.") n_samples = data.shape[0] # get the number of neighbors to use in estimating the CMI @@ -121,7 +249,13 @@ def mutual_info_ksg( val = _cmi_ksg(data, x_idx, y_idx, z_idx, nn_estimator, knn_here) else: val = _mi_ksg(data, x_idx, y_idx, nn_estimator, knn_here) - return val + + if norm == 'max': + norm_constant = np.log(1) + else: + norm_constant = np.log(np.pi ** (data.shape[1] / 2) / scipy.special.gamma(data.shape[1] / 2 + 1) / 2 ** data.shape[1]) + + return val + norm_constant def _preprocess_data(data, transform, rng): @@ -144,7 +278,83 @@ def _preprocess_data(data, transform, rng): return data -def _mi_ksg(data, x_idx, y_idx, nn_estimator: BaseEstimator, knn_here: int, verbose: bool=False)-> float: +def _cmi_ksg_scipy( + data, x_idx, y_idx, z_idx, knn_here: int, n_jobs: int=-1 +) -> float: + tree_xyz = scipy.spatial.KDTree(data) + radius_per_sample = tree_xyz.query( + data, k=[knn_here + 1], p=np.inf, eps=0.0, workers=n_jobs + )[0][:, 0].astype(np.float64) + + # To search neighbors < eps + radius_per_sample = np.multiply(radius_per_sample, 0.99999) + + # compute on the subspace of X, Z + xz_idx = np.concatenate((x_idx, z_idx)).squeeze() + tree_xz = scipy.spatial.KDTree(data[:, xz_idx]) + num_nn_xz = tree_xz.query_ball_point( + data[:, xz_idx], r=radius_per_sample, eps=0.0, p=np.inf, workers=n_jobs, return_length=True + ) + + # compute on the subspace of Y, Z + yz_idx = np.concatenate((y_idx, z_idx)).squeeze() + tree_yz = scipy.spatial.KDTree(data[:, yz_idx]) + num_nn_yz = tree_yz.query_ball_point( + data[:, yz_idx], r=radius_per_sample, eps=0.0, p=np.inf, workers=n_jobs, return_length=True + ) + + tree_z = scipy.spatial.KDTree(data[:, z_idx]) + num_nn_z = tree_z.query_ball_point( + data[:, z_idx], r=radius_per_sample, eps=0.0, p=np.inf, workers=n_jobs, return_length=True + ) + + # compute the final CMI value + hxyz = scipy.special.digamma(knn_here) + hxz = scipy.special.digamma(num_nn_xz) + hyz = scipy.special.digamma(num_nn_yz) + hz = scipy.special.digamma(num_nn_z) + val = hxyz - (hxz + hyz - hz).mean() + return val + + +def _mi_ksg_scipy( + data, x_idx, y_idx, knn_here: int, n_jobs: int=-1 +) -> float: + n_samples = data.shape[0] + + tree_xyz = scipy.spatial.KDTree(data) + radius_per_sample = tree_xyz.query( + data, k=[knn_here + 1], p=np.inf, eps=0.0, workers=n_jobs + )[0][:, 0].astype(np.float64) + + # To search neighbors < eps + radius_per_sample = np.multiply(radius_per_sample, 0.99999) + + # compute on the subspace of X + tree_x = scipy.spatial.KDTree(data[:, x_idx]) + num_nn_x = tree_x.query_ball_point( + data[:, x_idx], r=radius_per_sample, eps=0.0, p=np.inf, workers=n_jobs, return_length=True + ) + + # compute on the subspace of Y + tree_y = scipy.spatial.KDTree(data[:, y_idx]) + num_nn_y = tree_y.query_ball_point( + data[:, x_idx], r=radius_per_sample, eps=0.0, p=np.inf, workers=n_jobs, return_length=True + ) + + # compute the final MI value + # \\psi(k) - E[(\\psi(n_x) + \\psi(n_y))] + \\psi(n) + hxy = scipy.special.digamma(knn_here) + hx = scipy.special.digamma(num_nn_x) + hy = scipy.special.digamma(num_nn_y) + hn = scipy.special.digamma(n_samples) + val = hxy - (hx + hy).mean() + hn + return val + + +def _mi_ksg( + data, x_idx, y_idx, nn_estimator: BaseEstimator, knn_here: int +) -> float: """Compute KSG estimate of MI. Parameters @@ -169,7 +379,7 @@ def _mi_ksg(data, x_idx, y_idx, nn_estimator: BaseEstimator, knn_here: int, verb n_samples = data.shape[0] # estimate distance to the kth NN in XYZ subspace for each sample - neigh = nn_estimator.fit(data, force_fit=True) + neigh = nn_estimator.fit(data) dists, _ = neigh.kneighbors(n_neighbors=knn_here) # - get the radius we want to use per sample as the distance to the kth neighbor @@ -177,20 +387,10 @@ def _mi_ksg(data, x_idx, y_idx, nn_estimator: BaseEstimator, knn_here: int, verb radius_per_sample = dists[:, -1] # compute on the subspace of X - num_nn_x = _compute_radius_nbrs( - data, - radius_per_sample, - nn_estimator, - col_idx=x_idx - ) + num_nn_x = _compute_radius_nbrs(data, radius_per_sample, nn_estimator, col_idx=x_idx) # compute on the subspace of Y - num_nn_y = _compute_radius_nbrs( - data, - radius_per_sample, - nn_estimator, - col_idx=y_idx - ) + num_nn_y = _compute_radius_nbrs(data, radius_per_sample, nn_estimator, col_idx=y_idx) # compute the final MI value # \\psi(k) - E[(\\psi(n_x) + \\psi(n_y))] + \\psi(n) @@ -202,7 +402,9 @@ def _mi_ksg(data, x_idx, y_idx, nn_estimator: BaseEstimator, knn_here: int, verb return val -def _cmi_ksg(data: ArrayLike, x_idx, y_idx, z_idx, nn_estimator: BaseEstimator, knn_here: int) -> float: +def _cmi_ksg( + data: ArrayLike, x_idx, y_idx, z_idx, nn_estimator: BaseEstimator, knn_here: int +) -> float: """Compute KSG estimate of CMI. Parameters @@ -227,7 +429,7 @@ def _cmi_ksg(data: ArrayLike, x_idx, y_idx, z_idx, nn_estimator: BaseEstimator, Estimated CMI value. """ # estimate distance to the kth NN in XYZ subspace for each sample - neigh = nn_estimator.fit(data, force_fit=True) + neigh = nn_estimator.fit(data) # get the radius we want to use per sample as the distance to the kth neighbor # in the joint distribution space @@ -275,15 +477,23 @@ def _compute_radius_nbrs( nn_estimator: BaseEstimator, col_idx: Optional[ArrayLike] = None, ): - n_samples = radius_per_sample.shape[0] - # compute distances in the subspace defined by data - nn_estimator.fit(data[:, col_idx], force_fit=True) + nn_estimator.fit(data[:, col_idx]) - num_nn_data = np.zeros((n_samples,)) - for idx in range(n_samples): - nn = nn_estimator.radius_neighbors(radius=radius_per_sample[idx], return_distance=False) - num_nn_data[idx] = len(nn) + # compute the radius neighbors for each sample + if getattr(nn_estimator, '_supports_multi_radii', False): + nn = nn_estimator.radius_neighbors( + X=data[:, col_idx], radius=radius_per_sample, return_distance=False + ) + else: + nn = [] + for idx, radius in enumerate(radius_per_sample): + nn_ = nn_estimator.radius_neighbors( + X=np.atleast_2d(data[:, col_idx]), radius=radius, return_distance=False + ) + nn.append(nn_[idx]) + + num_nn_data = np.array([len(nn_) for nn_ in nn]) return num_nn_data diff --git a/sktree/experimental/meson.build b/sktree/experimental/meson.build index 3d0620146..ba2b5f4e7 100644 --- a/sktree/experimental/meson.build +++ b/sktree/experimental/meson.build @@ -3,6 +3,7 @@ python_sources = [ 'mutual_info.py', 'simulate.py', 'ksg.py', + 'forest.py', ] py3.install_sources( diff --git a/sktree/experimental/simulate.py b/sktree/experimental/simulate.py index 56146b16c..2eace859c 100644 --- a/sktree/experimental/simulate.py +++ b/sktree/experimental/simulate.py @@ -2,7 +2,8 @@ import scipy.linalg import scipy.special import scipy.stats - +from scipy.stats import entropy, multivariate_normal +from scipy.integrate import nquad def simulate_helix( radius_a=0, @@ -226,11 +227,110 @@ def simulate_multivariate_gaussian(mean=None, cov=None, d=2, n_samples=1000, see def embed_high_dims(data, n_dims=50, random_state=None): rng = np.random.default_rng(random_state) - new_data = np.zeros((data.shape[0], n_dims + data.shape[1])) - new_data[:, :data.shape[1]] = data + new_data = np.zeros((data.shape[0], n_dims + data.shape[1])) + new_data[:, : data.shape[1]] = data for idim in range(n_dims): new_col = rng.standard_normal(size=(data.shape[0],)) new_data[:, data.shape[1] + idim] = new_col - return new_data \ No newline at end of file + return new_data + + +def simulate_separate_gaussians(n_dims=2, n_samples=1000, n_classes=2, pi=None, seed=None): + """Simulate data from separate multivariate Gaussians. + + Parameters + ---------- + n_dims : int + The dimensionality of the data. The default is 2. + n_samples : int + The number of samples to generate. The default is 1000. + n_classes : int + The number of classes to generate. The default is 2. + pi : array-like of shape (n_classes,) + The class probabilities. If None (default), then uniform class probabilities are used. + seed : int + The random seed to feed to :func:`numpy.random.default_rng`. The default is None. + + Returns + ------- + data : array-like of shape (n_samples, n_dims) + The generated data. + y : array-like of shape (n_samples,) + The class labels. + means : list of array-like of shape (n_dims,) + The means of the Gaussians from each class. + sigmas : list of array-like of shape (n_dims, n_dims) + The covariance matrices of the Gaussians from each class. + pi : array-like of shape (n_classes,) + The class probabilities. + I_XY : float + The ground-truth mutual information between the class labels and the data. + + Notes + ----- + This simulates data from separate multivariate Gaussians, where each class has its own + multivariate Gaussian distribution. The class labels are sampled from a multinomial distribution + with probabilities `pi`. + + The ground-truth computation of the MI depends on + """ + rng = np.random.default_rng(seed) + + if pi is None: + pi = np.ones((n_classes,)) / n_classes + else: + if len(pi) != n_classes: + raise RuntimeError(f"pi should be of length {n_classes}") + + # first sample the class labels according to class probabilities + counts = rng.multinomial(n_samples, pi, size = 1)[0] + + # now sample the multivariate Gaussian for each class + means = [np.zeros((n_dims,))] + sigmas = [np.eye(n_dims)] + for _ in range(1, n_classes): + mean = rng.standard_normal(size=(n_dims,)) + sigma = np.eye(n_dims) + + means.append(mean) + sigmas.append(sigma) + + # now sample the data + X_data = [] + y_data = [] + for k in range(n_classes): + X_data.append(rng.multivariate_normal(means[k], sigmas[k], counts[k])) + y_data.append(np.repeat(k, counts[k])) + X = np.concatenate(tuple(X_data)) + y = np.concatenate(tuple(y_data)) + + # compute ground-truth MI + base = np.exp(1) + H_Y = entropy(pi, base=base) + + def func(*args): + # points at which to evaluate the multivariate-Gaussian + x_points = np.array(args) + + # compute the probability of the multivariate-Gaussian at various points in the + # d-dimensional space + p = 0.0 + for k in range(n_classes): + p += pi[k] * multivariate_normal.pdf(x_points, mean=means[k], cov=sigmas[k], seed=seed) + + # compute the log-probability + return -p * np.log(p) / np.log(base) + + # limits over each dimension of the multivariate-Gaussian + lims = [[-10, 10]]*n_dims + H_X = nquad(func, range=lims) + + # now compute H(Y|X) + H_YX = 0.0 + for k in range(n_classes): + # [d * log(2 * pi) + log(det(sigma)) + d] / (2 * log(base)) + H_YX += pi[k] * (n_dims * np.log(2*np.pi) + np.log(np.linalg.det(sigmas[k])) + n_dims) / (2. * np.log(base)) + I_XY = H_Y + H_X - H_YX + return X, y, means, sigmas, pi, I_XY \ No newline at end of file diff --git a/sktree/experimental/tests/meson.build b/sktree/experimental/tests/meson.build index ac5c5d40f..7f240c2b4 100644 --- a/sktree/experimental/tests/meson.build +++ b/sktree/experimental/tests/meson.build @@ -2,6 +2,7 @@ python_sources = [ '__init__.py', 'test_mutual_info.py', 'test_simulate.py', + 'test_ksg.py', ] py3.install_sources( diff --git a/sktree/experimental/tests/test_ksg.py b/sktree/experimental/tests/test_ksg.py new file mode 100644 index 000000000..c2be62114 --- /dev/null +++ b/sktree/experimental/tests/test_ksg.py @@ -0,0 +1,51 @@ +import itertools + +import numpy as np + +from sktree.experimental.ksg import entropy_continuous +from sktree.experimental.mutual_info import entropy_gaussian + +seed = 12345 + +rng = np.random.default_rng(seed) + + +def get_mvn_data(total_rvs, dimensionality=2, scale_sigma_offdiagonal_by=1.0, total_samples=1000): + data_space_size = total_rvs * dimensionality + + # initialise distribution + mu = rng.standard_normal(data_space_size) + sigma = rng.rand(data_space_size, data_space_size) + + # ensures that sigma is positive semi-definite + sigma = np.dot(sigma.transpose(), sigma) + + # scale off-diagonal entries -- might want to change that to block diagonal entries + # diag = np.diag(sigma).copy() + # sigma *= scale_sigma_offdiagonal_by + # sigma[np.diag_indices(len(diag))] = diag + + # scale off-block diagonal entries + d = dimensionality + for ii, jj in itertools.product(list(range(total_rvs)), repeat=2): + if ii != jj: + sigma[d * ii : d * (ii + 1), d * jj : d * (jj + 1)] *= scale_sigma_offdiagonal_by + + # get samples + samples = rng.multivariate_normal(mu, sigma, size=total_samples) + + return [samples[:, ii * d : (ii + 1) * d] for ii in range(total_rvs)] + + +def test_get_h_1d(k=5, norm="max"): + X = rng.standard_normal(size=(1000, 1)) + cov_X = np.atleast_2d(np.cov(X.T)) + + analytic = entropy_gaussian(cov_X) + kozachenko = entropy_continuous(X, k=k, norm=norm) + + print("analytic result: {:.5f}".format(analytic)) + print("K-L estimator: {:.5f}".format(kozachenko)) + assert np.isclose( + analytic, kozachenko, rtol=0.1, atol=0.1 + ), "K-L estimate strongly differs from analytic expectation!" diff --git a/sktree/neighbors.py b/sktree/neighbors.py index 653e7ff97..e4291882e 100644 --- a/sktree/neighbors.py +++ b/sktree/neighbors.py @@ -1,15 +1,31 @@ import numbers from copy import copy +from joblib import Parallel, delayed import numpy as np from sklearn.base import BaseEstimator, MetaEstimatorMixin from sklearn.exceptions import NotFittedError +from sklearn.metrics import DistanceMetric from sklearn.neighbors import NearestNeighbors from sklearn.utils.validation import check_is_fitted from sktree.tree._neighbors import _compute_distance_matrix, compute_forest_similarity_matrix +def forest_distance(clf, X, Y) -> float: + X = np.atleast_2d(X) + Y = np.atleast_2d(Y) + XY = np.concatenate((X, Y), axis=0) + aff_matrix = compute_forest_similarity_matrix(clf, XY) + + # dists should be (2, 2) + dists = _compute_distance_matrix(aff_matrix) + if dists.shape != (2, 2): + raise RuntimeError("This shouldnt happen") + + return dists[0, 1] + + class NearestNeighborsMetaEstimator(BaseEstimator, MetaEstimatorMixin): """Meta-estimator for nearest neighbors. @@ -29,16 +45,30 @@ class NearestNeighborsMetaEstimator(BaseEstimator, MetaEstimatorMixin): See :class:`sklearn.neighbors.NearestNeighbors` for details. n_jobs : int, optional The number of parallel jobs to run for neighbors, by default None. + force_fit : bool, optional + If True, the estimator will be fit even if it is already fitted, by default False. """ - - def __init__(self, estimator, n_neighbors=5, radius=1.0, algorithm="auto", n_jobs=None): + _supports_multi_radii: bool = True + + def __init__( + self, + estimator, + n_neighbors=5, + radius=1.0, + algorithm="auto", + n_jobs=None, + force_fit=False, + verbose: bool = False, + ): self.estimator = estimator self.n_neighbors = n_neighbors self.algorithm = algorithm self.radius = radius self.n_jobs = n_jobs + self.force_fit = force_fit + self.verbose = verbose - def fit(self, X, y=None, force_fit=False): + def fit(self, X, y=None): """Fit the nearest neighbors estimator from the training dataset. Parameters @@ -62,7 +92,7 @@ def fit(self, X, y=None, force_fit=False): X = self._validate_data(X, accept_sparse="csc") self.estimator_ = copy(self.estimator) - if force_fit: + if self.force_fit: self.estimator_.fit(X, y) else: try: @@ -70,7 +100,17 @@ def fit(self, X, y=None, force_fit=False): except NotFittedError: self.estimator_.fit(X, y) - self._fit(X, self.n_neighbors) + if self.verbose: + print(f"Finished fitting estimator: {self.estimator_}") + + # get the number of neighbors to use in estimating the CMI + n_samples = X.shape[0] + if self.n_neighbors < 1: + knn_here = max(1, int(self.n_neighbors * n_samples)) + else: + knn_here = max(1, int(self.n_neighbors)) + + self._fit(X, knn_here) return self def _fit(self, X, n_neighbors): @@ -82,12 +122,20 @@ def _fit(self, X, n_neighbors): ) # compute the distance matrix - aff_matrix = compute_forest_similarity_matrix(self.estimator_, X) - dists = _compute_distance_matrix(aff_matrix) + dists = self._compute_distance_matrix(X) + + if self.verbose: + print(f"Finished computing distance matrix: {dists.shape}") # fit the nearest-neighbors estimator self.neigh_est_.fit(dists) + def _compute_distance_matrix(self, X): + # compute the distance matrix + aff_matrix = compute_forest_similarity_matrix(self.estimator_, X) + dists = _compute_distance_matrix(aff_matrix) + return dists + def kneighbors(self, X=None, n_neighbors=None, return_distance=True): """Find the K-neighbors of a point. @@ -188,26 +236,54 @@ def radius_neighbors(self, X=None, radius=None, return_distance=True, sort_resul if X is not None: n_samples = X.shape[0] + X = self._compute_distance_matrix(X) + + if self.verbose: + print(f"Finished computing distance matrix: {X.shape}") else: n_samples = self.neigh_est_.n_samples_fit_ - if isinstance(radius, numbers.Number): - radius = [radius] * n_samples - # now construct nearest neighbor indices and distances within radius - nn_ind_data = np.zeros((n_samples,), dtype=object) - nn_dist_data = np.zeros((n_samples,), dtype=object) - for idx in range(n_samples): - nn = self.neigh_est_.radius_neighbors( - X=X, radius=radius[idx], return_distance=return_distance, sort_results=sort_results + if self.verbose: + print(f"Computing radius neighbors for {n_samples} samples") + + if isinstance(radius, numbers.Number): + # return only neighbors within one fixed radius across all samples + return self.neigh_est_.radius_neighbors( + X=X, radius=radius, return_distance=return_distance, sort_results=sort_results ) + else: + # forest-based nearest neighbors needs to support all samples to get pairwise + # distances before querying the radius neighbors API + if X is None: + raise RuntimeError("Must provide X if radius is an array of numbers") + + if len(radius) != n_samples: + raise RuntimeError(f"Expected {n_samples} radius values, got {len(radius)}") + + nn_inds_arr = np.zeros((n_samples,), dtype=object) + nn_dist_arr = np.zeros((n_samples,), dtype=object) + + # compute radius neighbors for each sample in parallel + if self.verbose: + print('Computing radius neighbors in parallel...') + + result = Parallel(n_jobs=self.n_jobs)(delayed(_parallel_radius_nbrs)( + self.neigh_est_.radius_neighbors, np.atleast_2d(X[idx, :]), radius[idx], return_distance + ) for idx in range(n_samples)) + + for idx, nn in enumerate(result): + if return_distance: + nn_inds_arr[idx] = nn[0][0] + nn_dist_arr[idx] = nn[1][0] + else: + nn_inds_arr[idx] = nn[0] if return_distance: - nn_ind_data[idx] = nn[0][idx] - nn_dist_data[idx] = nn[1][idx] + return nn_dist_arr, nn_inds_arr else: - nn_ind_data[idx] = nn + return nn_inds_arr + - if return_distance: - return nn_dist_data, nn_ind_data - return nn_ind_data +def _parallel_radius_nbrs(radius_nbr_func, X, radius, return_distance): + return radius_nbr_func(X, radius, return_distance) \ No newline at end of file