From 72d3ed765a6f6e7b31b5659c197fa30f53f96054 Mon Sep 17 00:00:00 2001 From: j-bac Date: Wed, 25 May 2022 22:58:58 +0200 Subject: [PATCH 1/4] cluster --- README.md | 100 +-- skdim/__init__.py | 1 + skdim/_commonfuncs.py | 53 +- skdim/cluster.py | 82 +++ skdim/datasets.py | 1 + skdim/id/_FisherS.py | 1406 ++++++++++++++++++++--------------------- skdim/id/_PCA.py | 550 ++++++++-------- 7 files changed, 1119 insertions(+), 1074 deletions(-) create mode 100644 skdim/cluster.py diff --git a/README.md b/README.md index 11dc7e3..4c1d70b 100644 --- a/README.md +++ b/README.md @@ -1,50 +1,50 @@ -[![Build status](https://ci.appveyor.com/api/projects/status/tvumlfad69g6ap3u/branch/master?svg=true)](https://ci.appveyor.com/project/j-bac/scikit-dimension/branch/master) -[![CircleCI](https://circleci.com/gh/j-bac/scikit-dimension/tree/master.svg?style=shield)](https://circleci.com/gh/j-bac/scikit-dimension/tree/master) -[![Documentation Status](https://readthedocs.org/projects/scikit-dimension/badge/?version=latest)](https://scikit-dimension.readthedocs.io/en/latest/?badge=latest) -[![codecov](https://codecov.io/gh/j-bac/scikit-dimension/branch/master/graph/badge.svg)](https://codecov.io/gh/j-bac/scikit-dimension) -[![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/j-bac/scikit-dimension.svg?logo=lgtm&logoWidth=18&label=%20code%20quality)](https://lgtm.com/projects/g/j-bac/scikit-dimension/context:python) -[![GitHub license](https://img.shields.io/github/license/j-bac/scikit-dimension)](https://github.com/j-bac/scikit-dimension/blob/master/LICENSE) -[![Downloads](https://pepy.tech/badge/scikit-dimension)](https://pepy.tech/project/scikit-dimension) - -# scikit-dimension - -scikit-dimension is a Python module for intrinsic dimension estimation built according to the [scikit-learn](https://github.com/scikit-learn/scikit-learn) API and distributed under the 3-Clause BSD license. - -Please refer to the [documentation](https://scikit-dimension.readthedocs.io) and the [paper](https://www.mdpi.com/1099-4300/23/10/1368) for detailed API, examples and references - -### Installation - -Using pip: -```bash -pip install scikit-dimension -``` - -From source: -```bash -git clone https://github.com/j-bac/scikit-dimension -cd scikit-dimension -pip install . -``` - -### Quick start - -Local and global estimators can be used in this way: - -```python -import skdim -import numpy as np - -#generate data : np.array (n_points x n_dim). 
Here a uniformly sampled 5-ball embedded in 10 dimensions -data = np.zeros((1000,10)) -data[:,:5] = skdim.datasets.hyperBall(n = 1000, d = 5, radius = 1, random_state = 0) - -#estimate global intrinsic dimension -danco = skdim.id.DANCo().fit(data) -#estimate local intrinsic dimension (dimension in k-nearest-neighborhoods around each point): -lpca = skdim.id.lPCA().fit_pw(data, - n_neighbors = 100, - n_jobs = 1) - -#get estimated intrinsic dimension -print(danco.dimension_, np.mean(lpca.dimension_pw_)) -``` +[![Build status](https://ci.appveyor.com/api/projects/status/tvumlfad69g6ap3u/branch/master?svg=true)](https://ci.appveyor.com/project/j-bac/scikit-dimension/branch/master) +[![CircleCI](https://circleci.com/gh/j-bac/scikit-dimension/tree/master.svg?style=shield)](https://circleci.com/gh/j-bac/scikit-dimension/tree/master) +[![Documentation Status](https://readthedocs.org/projects/scikit-dimension/badge/?version=latest)](https://scikit-dimension.readthedocs.io/en/latest/?badge=latest) +[![codecov](https://codecov.io/gh/j-bac/scikit-dimension/branch/master/graph/badge.svg)](https://codecov.io/gh/j-bac/scikit-dimension) +[![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/j-bac/scikit-dimension.svg?logo=lgtm&logoWidth=18&label=%20code%20quality)](https://lgtm.com/projects/g/j-bac/scikit-dimension/context:python) +[![GitHub license](https://img.shields.io/github/license/j-bac/scikit-dimension)](https://github.com/j-bac/scikit-dimension/blob/master/LICENSE) +[![Downloads](https://pepy.tech/badge/scikit-dimension)](https://pepy.tech/project/scikit-dimension) + +# scikit-dimension + +scikit-dimension is a Python module for intrinsic dimension estimation built according to the [scikit-learn](https://github.com/scikit-learn/scikit-learn) API and distributed under the 3-Clause BSD license. + +Please refer to the [documentation](https://scikit-dimension.readthedocs.io) and the [paper](https://www.mdpi.com/1099-4300/23/10/1368) for detailed API, examples and references + +### Installation + +Using pip: +```bash +pip install scikit-dimension +``` + +From source: +```bash +git clone https://github.com/j-bac/scikit-dimension +cd scikit-dimension +pip install . +``` + +### Quick start + +Local and global estimators can be used in this way: + +```python +import skdim +import numpy as np + +#generate data : np.array (n_points x n_dim). Here a uniformly sampled 5-ball embedded in 10 dimensions +data = np.zeros((1000,10)) +data[:,:5] = skdim.datasets.hyperBall(n = 1000, d = 5, radius = 1, random_state = 0) + +#estimate global intrinsic dimension +danco = skdim.id.DANCo().fit(data) +#estimate local intrinsic dimension (dimension in k-nearest-neighborhoods around each point): +lpca = skdim.id.lPCA().fit_pw(data, + n_neighbors = 100, + n_jobs = 1) + +#get estimated intrinsic dimension +print(danco.dimension_, np.mean(lpca.dimension_pw_)) +``` diff --git a/skdim/__init__.py b/skdim/__init__.py index c0f5430..6cf6ff9 100644 --- a/skdim/__init__.py +++ b/skdim/__init__.py @@ -31,5 +31,6 @@ # from . import datasets from . import id +from . 
import cluster from ._commonfuncs import get_nn, asPointwise from ._version import __version__ diff --git a/skdim/_commonfuncs.py b/skdim/_commonfuncs.py index 11b16a3..14c9a0e 100644 --- a/skdim/_commonfuncs.py +++ b/skdim/_commonfuncs.py @@ -94,12 +94,15 @@ def proxy(tup): return function(X, **Dict) -def get_nn(X, k, n_jobs=1): +def get_nn(X, k, n_jobs=1, sparse=False): """Compute the k-nearest neighbors of a dataset np.array (n_samples x n_dims)""" neigh = NearestNeighbors(n_neighbors=k, n_jobs=n_jobs) neigh.fit(X) - dists, inds = neigh.kneighbors(return_distance=True) - return dists, inds + if sparse: + return neigh.kneighbors_graph(mode='distance') + else: + dists, inds = neigh.kneighbors(return_distance=True) + return dists, inds def asPointwise(data, class_instance, precomputed_knn=None, n_neighbors=100, n_jobs=1): @@ -117,49 +120,7 @@ def asPointwise(data, class_instance, precomputed_knn=None, n_neighbors=100, n_j else: return np.array([class_instance.fit(data[i, :]).dimension_ for i in knn]) - -# class DocInheritorBase(type): -# """ A metaclass to append GlobalEstimator or LocalEstimator Attributes section docstring to each estimator""" -# -# def __new__(mcs, class_name, class_bases, class_dict): -# # inherit class docstring: the docstring is constructed by traversing -# # the mro for the class and merging their docstrings, with each next -# # docstring as serving as the 'parent', and the accumulated docstring -# # serving as the 'child' -# this_doc = class_dict.get("__doc__", None) -# for mro_cls in (mro_cls for base in class_bases for mro_cls in base.mro()): -# prnt_cls_doc = mro_cls.__doc__ -# if prnt_cls_doc is not None: -# if prnt_cls_doc == "The most base type": -# prnt_cls_doc = None -# this_doc = mcs.class_doc_inherit(prnt_cls_doc, this_doc) -# -# class_dict["__doc__"] = this_doc -# -# return type.__new__(mcs, class_name, class_bases, class_dict) -# -# @staticmethod -# def class_doc_inherit(prnt_doc, child_doc): -# """ Merge the docstrings of a parent class and its child. 
-# -# Parameters -# ---------- -# prnt_cls_doc: Union[None, str] -# child_doc: Union[None, str] -# """ -# if prnt_doc is None or "dimension_" not in prnt_doc: -# return child_doc -# else: -# if "Attributes" in child_doc: -# prnt_doc_attr = prnt_doc.index("dimension_") -# child_doc = child_doc + prnt_doc[prnt_doc_attr:] + "\n" -# else: -# prnt_doc_attr = prnt_doc.index("Attributes") -# child_doc = child_doc + "\n " + prnt_doc[prnt_doc_attr:] -# return child_doc - - -class GlobalEstimator(BaseEstimator): # , metaclass=DocInheritorBase): +class GlobalEstimator(BaseEstimator): """ Template base class: inherit BaseEstimator, define transform, fit_transform, fit_pw, transform_pw, fit_transform_pw Attributes diff --git a/skdim/cluster.py b/skdim/cluster.py new file mode 100644 index 0000000..0c1f7d1 --- /dev/null +++ b/skdim/cluster.py @@ -0,0 +1,82 @@ +import numpy as np +import inspect +from scipy.cluster.hierarchy import dendrogram +import sklearn.cluster +from sklearn.base import ClusterMixin, BaseEstimator + +def __dir__(): return ['getLinkage','dendrogram','AgglomerativeClustering','CutPursuit'] + +def getLinkage(model): + counts = np.zeros(model.children_.shape[0]) + n_samples = len(model.labels_) + for i, merge in enumerate(model.children_): + current_count = 0 + for child_idx in merge: + if child_idx < n_samples: + current_count += 1 # leaf node + else: + current_count += counts[child_idx - n_samples] + counts[i] = current_count + + linkage_matrix = np.column_stack( + [model.children_, model.distances_, counts] + ).astype(float) + return linkage_matrix + + +class AgglomerativeClustering(sklearn.cluster.AgglomerativeClustering): + def __init__(self, + n_clusters=None, + distance_threshold=None, + affinity="euclidean", + memory=None, + connectivity=None, + compute_full_tree='auto', + linkage='ward', + compute_distances=False): + + kwargs = inspect.getargvalues(inspect.currentframe())[-1] + kwargs.pop("self") + super().__init__() + #super().set_params(**{k:kwargs[k] for k in super().get_params()}) + for arg, val in kwargs.items(): + setattr(self, arg, val) + + def fit(self,lid): + + if len(lid.shape) == 1: + lid = lid.reshape(-1,1) + if not(self.affinity in ['euclidean', 'l1', 'l2']): + raise ValueError("affinity must be one of 'euclidean', 'l1', 'l2'") + if self.connectivity is None: + print("WARNING: connectivity matrix is None."+ + " Clustering will not take spatial relationships into account") + if (self.n_clusters is None) and (self.distance_threshold is None): + self.distance_threshold = np.around(np.std(lid),2) + print(f"n_clusters and distance_threshold are both None."+ + " Setting distance_threshold as std(lid) =",round(self.distance_threshold,2)) + + super().fit(lid) + + self.comp_mean_lid = np.array([np.mean(lid[self.labels_==c]) for c in np.unique(self.labels_)]) + return self + +class CutPursuit(ClusterMixin,BaseEstimator): + #def __new__(cls): + # try: + # cls.cutpursuit = __import__('cutpursuit') + # return super(CutPursuit,cls).__new__(cls) + # except ModuleNotFoundError as err: + # print(f'{err}, install with: \n', + # 'conda install gxx-compiler -y \n', + # 'pip install git+https://github.com/j-bac/parallel-cut-pursuit') + # raise + + def __init__(self): + try: + self.cutpursuit = __import__('cutpursuit') + except ModuleNotFoundError as err: + print(f'To use this class, install cutpursuit with: \n', + 'conda install gxx-compiler -y \n', + 'pip install git+https://github.com/j-bac/parallel-cut-pursuit') + raise diff --git a/skdim/datasets.py b/skdim/datasets.py index 
70f0c22..bc622ff 100644 --- a/skdim/datasets.py +++ b/skdim/datasets.py @@ -34,6 +34,7 @@ from sklearn.utils.validation import check_random_state from scipy.special import gammainc +def __dir__(): return ['BenchmarkManifolds','hyperBall', 'hyperSphere', 'hyperTwinPeaks', 'lineDiskBall', 'swissRoll3Sph'] def hyperBall(n, d, radius=1.0, center=[], random_state=None): """ diff --git a/skdim/id/_FisherS.py b/skdim/id/_FisherS.py index 8fdd2e2..8b01b8f 100644 --- a/skdim/id/_FisherS.py +++ b/skdim/id/_FisherS.py @@ -1,703 +1,703 @@ -# -# BSD 3-Clause License -# -# Copyright (c) 2020, Jonathan Bac -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -from sklearn.utils.validation import check_array -import numba as nb -import numpy as np -import sklearn.decomposition as sk -from scipy.special import lambertw -from matplotlib import pyplot as plt -from .._commonfuncs import GlobalEstimator -import warnings - -warnings.filterwarnings("ignore") - - -class FisherS(GlobalEstimator): - """Intrinsic dimension estimation using the Fisher Separability algorithm. [Albergante2019]_ - - Parameters - ---------- - conditional_number: float, default=10 - A positive real value used to select the top principal components. We consider only PCs with eigen values - which are not less than the maximal eigenvalue divided by conditional_number - project_on_sphere: bool, default=True - A boolean value indicating if projecting on a sphere should be performed. - test_alphas: 2D np.array with dtype float - A row vector of floats, with alpha range, the values must be given increasing - within (0,1) interval. Default is np.arange(.6,1,.02)[None]. - produce_plots: bool, default=False - A boolean value indicating if the standard plots need to be drawn. 
- verbose: bool - Whether to print the number of retained principal components - limit_maxdim: bool - Whether to cap estimated maxdim to the embedding dimension - """ - - def __init__( - self, - conditional_number=10, - project_on_sphere=1, - alphas=None, - produce_plots=False, - verbose=0, - limit_maxdim=False, - ): - - self.conditional_number = conditional_number - self.project_on_sphere = project_on_sphere - self.alphas = alphas - self.produce_plots = produce_plots - self.verbose = verbose - self.limit_maxdim = limit_maxdim - - def fit(self, X, y=None): - """ - A reference implementation of a fitting function. - - Parameters - ---------- - X : {array-like}, shape (n_samples, n_features) - The training input samples. - y : dummy parameter to respect the sklearn API - - Returns - ------- - self : object - Returns self. - self.dimension_: float - The estimated intrinsic dimension - self.n_alpha : 1D np.array, float - Effective dimension profile as a function of alpha - self.n_single : float - A single estimate for the effective dimension - self.p_alpha : 2D np.array, float - Distributions as a function of alpha, matrix with columns corresponding to the alpha values, and with rows corresponding to objects. - self.separable_fraction : 1D np.array, float - Separable fraction of data points as a function of alpha - self.alphas : 2D np.array, float - Input alpha values - """ - - X = check_array(X, ensure_min_samples=2, ensure_min_features=2) - - # test_alphas introduced to pass sklearn checks (sklearn doesn't take arrays as default parameters) - if self.alphas is None: - self._alphas = np.arange(0.6, 1, 0.02)[None] - else: - self._alphas = self.alphas - - ( - self.n_alpha_, - self.dimension_, - self.p_alpha_, - self.alphas_, - self.separable_fraction_, - self.Xp_, - ) = self._SeparabilityAnalysis(X) - - self.is_fitted_ = True - # `fit` should always return `self` - return self - - @staticmethod - @nb.njit - def _histc(X, bins): - map_to_bins = np.digitize(X, bins) - r = np.zeros((len(X[0, :]), len(bins))) - for j in range(len(map_to_bins[0, :])): - for i in map_to_bins[:, j]: - r[j, i - 1] += 1 - return r - - def _preprocessing(self, X, center, dimred, whiten): - """ - Preprocessing of the dataset - - Inputs - X is n-by-d data matrix with n d-dimensional datapoints. - center is boolean. True means subtraction of mean vector. - dimred is boolean. True means applying of dimensionality reduction with - PCA. Number of used PCs is defined by conditional_number argument. - whiten is boolean. True means applying of whitenning. True whiten - automatically caused true dimred. - project_on_sphere is boolean. True means projecting data onto unit sphere - - Outputs - X is preprocessed data matrix. 
- """ - - # centering - sampleMean = np.mean(X, axis=0) - if center: - X = X - sampleMean - # dimensionality reduction if requested dimensionality reduction or whitening - if dimred or whiten: - pca = sk.PCA() - u = pca.fit_transform(X) - v = pca.components_.T - s = pca.explained_variance_ - sc = s / s[0] - ind = np.where(sc > 1 / self.conditional_number)[0] - X = X @ v[:, ind] - if self.verbose: - print( - "%i components are retained using conditional_number=%2.2f" - % (len(ind), self.conditional_number) - ) - - # whitening - if whiten: - X = u[:, ind] - st = np.std(X, axis=0, ddof=1) - X = X / st - # #project on sphere (scale each vector to unit length) - if self.project_on_sphere: - st = np.sqrt(np.sum(X ** 2, axis=1)) - st = np.array([st]).T - X = X / st - - return X - - @staticmethod - def _probability_inseparable_sphere(alphas, n): - """ - %probability_inseparable_sphere calculate theoretical probability for point - %to be inseparable for dimension n - % - %Inputs: - % alphas is 1-by-d vector of possible alphas. Must be row vector or scalar - % n is c-by-1 vector of dimnesions. Must be column vector or scalar. - % - %Outputs: - % p is c-by-d matrix of probabilities.""" - p = np.power((1 - np.power(alphas, 2)), (n - 1) / 2) / ( - alphas * np.sqrt(2 * np.pi * n) - ) - return p - - # def _checkSeparability(self, xy): - # dxy = np.diag(xy) - # sm = (xy/dxy).T - # sm = sm - np.diag(np.diag(sm)) - # sm = sm > self._alphas - # py = sum(sm.T) - # py = py/len(py[0, :]) - # separ_fraction = sum(py == 0)/len(py[0, :]) - # - # return separ_fraction, py - - def _checkSeparabilityMultipleAlpha(self, data): - """%checkSeparabilityMultipleAlpha calculate fraction of points inseparable - %for each alpha and fraction of points which are inseparable from each - %point for different alpha. - % - %Inputs: - % data is data matrix to calculate separability. Each row contains one - % data point. - % alphas is array of alphas to test separability. - % - %Outputs: - % separ_fraction fraction of points inseparable from at least one point. - % Fraction is calculated for each alpha. - % py is n-by-m matrix. py(i,j) is fraction of points which are - % inseparable from point data(i, :) for alphas(j).""" - - # Number of points per 1 loop. 
20k assumes approx 3.2GB - nP = 2000 - - alphas = self._alphas - # Normalize alphas - if len(alphas[:, 0]) > 1: - alphas = alphas.T - addedone = 0 - if max(self._alphas[0, :]) < 1: - alphas = np.array([np.append(alphas, 1)]) - addedone = 1 - - alphas = np.concatenate([[float("-inf")], alphas[0, :], [float("inf")]]) - - n = len(data) - counts = np.zeros((n, len(alphas))) - leng = np.zeros((n, 1)) - for k in range(0, n, nP): - # print('Chunk +{}'.format(k)) - e = k + nP - if e > n: - e = n - # Calculate diagonal part, divide each row by diagonal element - xy = data[k:e, :] @ data[k:e, :].T - leng[k:e] = np.diag(xy)[:, None] - xy = xy - np.diag(leng[k:e].squeeze()) - xy = xy / leng[k:e] - counts[k:e, :] = counts[k:e, :] + self._histc(xy.T, alphas) - # Calculate nondiagonal part - for kk in range(0, n, nP): - # Ignore diagonal part - if k == kk: - continue - ee = kk + nP - if ee > n: - ee = n - - xy = data[k:e, :] @ data[kk:ee, :].T - xy = xy / leng[k:e] - counts[k:e, :] = counts[k:e, :] + self._histc(xy.T, alphas) - - # Calculate cumulative sum - counts = np.cumsum(counts[:, ::-1], axis=1)[:, ::-1] - - # print(counts) - - py = counts / (n) - py = py.T - if addedone: - py = py[1:-2, :] - else: - py = py[1:-1, :] - - separ_fraction = sum(py == 0) / len(py[0, :]) - - return separ_fraction, py - - def _dimension_uniform_sphere(self, py): - """ - %Gives an estimation of the dimension of uniformly sampled n-sphere - %corresponding to the average probability of being inseparable and a margin - %value - % - %Inputs: - % py - average fraction of data points which are INseparable. - % alphas - set of values (margins), must be in the range (0;1) - % It is assumed that the length of py and alpha vectors must be of the - % same. - % - %Outputs: - % n - effective dimension profile as a function of alpha - % n_single_estimate - a single estimate for the effective dimension - % alfa_single_estimate is alpha for n_single_estimate. - """ - - if len(py) != len(self._alphas[0, :]): - raise ValueError( - "length of py (%i) and alpha (%i) does not match" - % (len(py), len(self._alphas[0, :])) - ) - - if np.sum(self._alphas <= 0) > 0 or np.sum(self._alphas >= 1) > 0: - raise ValueError( - [ - '"Alphas" must be a real vector, with alpha range, the values must be within (0,1) interval' - ] - ) - - # Calculate dimension for each alpha - n = np.zeros((len(self._alphas[0, :]))) - for i in range(len(self._alphas[0, :])): - if py[i] == 0: - # All points are separable. 
Nothing to do and not interesting - n[i] = np.nan - else: - p = py[i] - a2 = self._alphas[0, i] ** 2 - w = np.log(1 - a2) - n[i] = np.real(lambertw(-(w / (2 * np.pi * p * p * a2 * (1 - a2))))) / ( - -w - ) - - n[n == np.inf] = float("nan") - # Find indices of alphas which are not completely separable - inds = np.where(~np.isnan(n))[0] - if len(inds) == 0: - warnings.warn("All points are fully separable for any of the chosen alphas") - return n, np.array([np.nan]), np.nan - - # Find the maximal value of such alpha - alpha_max = max(self._alphas[0, inds]) - # The reference alpha is the closest to 90 of maximal partially separable alpha - alpha_ref = alpha_max * 0.9 - k = np.where( - abs(self._alphas[0, inds] - alpha_ref) - == min(abs(self._alphas[0, :] - alpha_ref)) - )[0] - # Get corresponding values - alfa_single_estimate = self._alphas[0, inds[k]] - n_single_estimate = n[inds[k]] - - return n, n_single_estimate, alfa_single_estimate - - # def _dimension_uniform_sphere_robust(self, py): - # '''modification to return selected index and handle the case where all values are 0''' - # if len(py) != len(self._alphas[0, :]): - # raise ValueError('length of py (%i) and alpha (%i) does not match' % ( - # len(py), len(self._alphas[0, :]))) - # - # if np.sum(self._alphas <= 0) > 0 or np.sum(self._alphas >= 1) > 0: - # raise ValueError( - # ['"Alphas" must be a real vector, with alpha range, the values must be within (0,1) interval']) - # - # # Calculate dimension for each alpha - # n = np.zeros((len(self._alphas[0, :]))) - # for i in range(len(self._alphas[0, :])): - # if py[i] == 0: - # # All points are separable. Nothing to do and not interesting - # n[i] = np.nan - # else: - # p = py[i] - # a2 = self._alphas[0, i]**2 - # w = np.log(1-a2) - # n[i] = lambertw(-(w/(2*np.pi*p*p*a2*(1-a2))))/(-w) - # - # n[n == np.inf] = float('nan') - # # Find indices of alphas which are not completely separable - # inds = np.where(~np.isnan(n))[0] - # if inds.size == 0: - # n_single_estimate = np.nan - # alfa_single_estimate = np.nan - # return n, n_single_estimate, alfa_single_estimate - # else: - # # Find the maximal value of such alpha - # alpha_max = max(self._alphas[0, inds]) - # # The reference alpha is the closest to 90 of maximal partially separable alpha - # alpha_ref = alpha_max*0.9 - # k = np.where(abs(self._alphas[0, inds]-alpha_ref) - # == min(abs(self._alphas[0, :]-alpha_ref)))[0] - # # Get corresponding values - # alfa_single_estimate = self._alphas[0, inds[k]] - # n_single_estimate = n[inds[k]] - # - # return n, n_single_estimate, alfa_single_estimate, inds[k] - - def point_inseparability_to_pointID( - self, idx="all_inseparable", force_definite_dim=True, verbose=True - ): - """ - Turn pointwise inseparability probability into pointwise global ID - Inputs : - args : same as SeparabilityAnalysis - kwargs : - idx : int, string - int for custom alpha index - 'all_inseparable' to choose alpha where lal points have non-zero inseparability probability - 'selected' to keep global alpha selected - force_definite_dim : bool - whether to force fully separable points to take the minimum detectable inseparability value (1/(n-1)) (i.e., maximal detectable dimension) - """ - if idx == "all_inseparable": # all points are inseparable - selected_idx = np.argwhere(np.all(self.p_alpha_ != 0, axis=1)).max() - elif idx == "selected": # globally selected alpha - selected_idx = (self.n_alpha_ == self.dimension_).tolist().index(True) - elif type(idx) == int: - selected_idx = idx - else: - raise ValueError("unknown idx 
parameter") - - # select palpha and corresponding alpha - palpha_selected = self.p_alpha_[selected_idx, :] - alpha_selected = self._alphas[0, selected_idx] - - py = palpha_selected.copy() - _alphas = np.repeat(alpha_selected, len(palpha_selected))[None] - - if force_definite_dim: - py[py == 0] = 1 / len(py) - - if len(py) != len(_alphas[0, :]): - raise ValueError( - "length of py (%i) and alpha (%i) does not match" - % (len(py), len(_alphas[0, :])) - ) - - if np.sum(_alphas <= 0) > 0 or np.sum(_alphas >= 1) > 0: - raise ValueError( - [ - '"Alphas" must be a real vector, with alpha range, the values must be within (0,1) interval' - ] - ) - - # Calculate dimension for each alpha - n = np.zeros((len(_alphas[0, :]))) - for i in range(len(_alphas[0, :])): - if py[i] == 0: - # All points are separable. Nothing to do and not interesting - n[i] = np.nan - else: - p = py[i] - a2 = _alphas[0, i] ** 2 - w = np.log(1 - a2) - n[i] = np.real(lambertw(-(w / (2 * np.pi * p * p * a2 * (1 - a2))))) / ( - -w - ) - - n[n == np.inf] = float("nan") - - # Find indices of alphas which are not completely separable - inds = np.where(~np.isnan(n))[0] - if self.verbose: - print( - str(len(inds)) + "/" + str(len(py)), - "points have nonzero inseparability probability for chosen alpha = " - + str(round(alpha_selected, 2)) - + f", force_definite_dim = {force_definite_dim}", - ) - return n, inds - - def getSeparabilityGraph(self, idx="all_inseparable", top_edges=10000): - data = self.Xp_ - if idx == "all_inseparable": # all points are inseparable - selected_idx = np.argwhere(np.all(self.p_alpha_ != 0, axis=1)).max() - elif idx == "selected": # globally selected alpha - selected_idx = (self.n_alpha_ == self.dimension_).tolist().index(True) - elif type(idx) == int: - selected_idx = idx - else: - raise ValueError("unknown idx parameter") - alpha_selected = self._alphas[0, selected_idx] - return self.buildSeparabilityGraph(data, alpha_selected, top_edges=top_edges) - - @staticmethod - def plotSeparabilityGraph(x, y, edges, alpha=0.3): - for i in range(len(edges)): - ii = edges[i][0] - jj = edges[i][1] - plt.plot([x[ii], x[jj]], [y[ii], y[jj]], "k-", alpha=alpha) - - @staticmethod - def buildSeparabilityGraph(data, alpha, top_edges=10000): - """weighted directed separability graph, represented by a list of tuples (point i, point j) and an array of weights - each tuple is the observation that point i is inseparable from j, the weight is /-alpha - - data is a matrix of data which is assumed to be properly normalized - alpha parameter is a signle value in this case - top_edges is the number of edges to return. if top_edges is negative then all edges will be returned - """ - - # Number of points per 1 loop. 
20k assumes approx 3.2GB - nP = np.min([2000, len(data)]) - n = len(data) - leng = np.zeros((n, 1)) - - # globalxy = np.zeros((n,n)) - - insep_edges = [] - weights = [] - symmetric_graph = True - symmetry_message = False - - for k in range(0, n, nP): - e = k + nP - if e > n: - e = n - # Calculate diagonal part, divide each row by diagonal element - xy = data[k:e, :] @ data[k:e, :].T - leng[k:e] = np.diag(xy)[:, None] - xy = xy - np.diag(leng[k:e].squeeze()) - xy = xy / leng[k:e] - # if skdim.lid.FisherS.check_symmetric(xy): - if np.allclose(xy, xy.T, rtol=1e-05, atol=1e-08): - # globalxy[k:e,k:e] = np.triu(xy) - for i in range(len(xy)): - for j in range(i + 1, len(xy)): - if xy[i, j] > alpha: - insep_edges.append((k + i, k + j)) - weights.append(xy[i, j] - alpha) - else: - symmetric_graph = False - # globalxy[k:e,k:e] = xy - for i in range(len(xy)): - for j in range(i + 1, len(xy)): - if xy[i, j] > alpha: - insep_edges.append((k + i, k + j)) - weights.append(xy[i, j] - alpha) - # Calculate nondiagonal part - startpoint = 0 - if symmetric_graph: - startpoint = k - if not symmetry_message: - print( - "Graph is symmetric, only upper triangle of the separability matrix will be used" - ) - symmetry_message = True - for kk in range(startpoint, n, nP): - # Ignore diagonal part - if not k == kk: - ee = kk + nP - if ee > n: - ee = n - xy = data[k:e, :] @ data[kk:ee, :].T - xy = xy / leng[k:e] - # globalxy[k:e,kk:ee] = xy - for i in range(ee - kk): - for j in range(ee - kk): - if xy[i, j] > alpha: - insep_edges.append((k + i, kk + j)) - weights.append(xy[i, j] - alpha) - - weights = np.array(weights) - - if top_edges > 0: - if top_edges < len(insep_edges): - weights_sorted = np.sort(weights) - weights_sorted = weights_sorted[::-1] - thresh = weights_sorted[top_edges] - insep_edges_filtered = [] - weights_filtered = [] - for i, w in enumerate(weights): - if w > thresh: - insep_edges_filtered.append(insep_edges[i]) - weights_filtered.append(w) - weights = np.array(weights_filtered) - insep_edges = insep_edges_filtered - - return insep_edges, weights - - @staticmethod - def check_symmetric(a, rtol=1e-05, atol=1e-08): - return np.allclose(a, a.T, rtol=rtol, atol=atol) - - def _SeparabilityAnalysis(self, X): - """ - %Performs standard analysis of separability and produces standard plots. - % - %Inputs: - % X - is a data matrix with one data point in each row. - % Optional arguments in varargin form Name, Value pairs. Possible names: - % 'conditional_number' - a positive real value used to select the top - % princinpal components. We consider only PCs with eigen values - % which are not less than the maximal eigenvalue divided by - % conditional_number Default value is 10. - % 'project_on_sphere' - a boolean value indicating if projecting on a - % sphere should be performed. Default value is true. - % 'Alphas' - a real vector, with alpha range, the values must be given increasing - % within (0,1) interval. Default is [0.6,0.62,...,0.98]. - % 'produce_plots' - a boolean value indicating if the standard plots - % need to be drawn. Default is true. - % 'verbose' - bool, whether to print number of retained principal components - % 'limit_maxdim' bool, whether to cap estimated maxdim to the embedding dimension - %Outputs: - % n_alpha - effective dimension profile as a function of alpha - % n_single - a single estimate for the effective dimension - % p_alpha - distributions as a function of alpha, matrix with columns - % corresponding to the alpha values, and with rows corresponding to - % objects. 
- % separable_fraction - separable fraction of data points as a function of - % alpha - % alphas - alpha values - """ - npoints = len(X[:, 0]) - # Preprocess data - Xp = self._preprocessing(X, 1, 1, 1) - # Check separability - separable_fraction, p_alpha = self._checkSeparabilityMultipleAlpha(Xp) - # Calculate mean fraction of separable points for each alpha. - py_mean = np.mean(p_alpha, axis=1) - n_alpha, n_single, alpha_single = self._dimension_uniform_sphere(py_mean) - - alpha_ind_selected = np.where(n_single == n_alpha)[0] - - if self.limit_maxdim: - n_single = np.clip(n_single, None, X.shape[1]) - - if self.produce_plots: - # Define the minimal and maximal dimensions for theoretical graph with - # two dimensions in each side - n_min = np.floor(min(n_alpha)) - 2 - n_max = np.floor(max(n_alpha) + 0.8) + 2 - if n_min < 1: - n_min = 1 - - ns = np.arange(n_min, n_max + 1) - - plt.figure() - plt.plot(self._alphas[0, :], n_alpha, "ko-") - plt.plot( - self._alphas[0, alpha_ind_selected], n_single, "rx", markersize=16, - ) - plt.xlabel("\u03B1", fontsize=16) - plt.ylabel("Effective dimension", fontsize=16) - locs, labels = plt.xticks() - plt.show() - nbins = int(round(np.floor(npoints / 200))) - - if nbins < 20: - nbins = 20 - - plt.figure() - plt.hist(p_alpha[alpha_ind_selected, :][0], bins=nbins) - plt.xlabel( - "Inseparability prob. for \u03B1=%2.2f" - % (self._alphas[0, alpha_ind_selected]), - fontsize=16, - ) - plt.ylabel("Number of values") - plt.show() - - plt.figure() - plt.xticks(locs, labels) - pteor = np.zeros((len(ns), len(self._alphas[0, :]))) - for k in range(len(ns)): - for j in range(len(self._alphas[0, :])): - pteor[k, j] = self._probability_inseparable_sphere( - self._alphas[0, j], ns[k] - ) - - for i in range(len(pteor[:, 0])): - plt.semilogy(self._alphas[0, :], pteor[i, :], "-", color="r") - plt.xlim(min(self._alphas[0, :]), 1) - if True in np.isnan(n_alpha): - plt.semilogy( - self._alphas[0, : np.where(np.isnan(n_alpha))[0][0]], - py_mean[: np.where(np.isnan(n_alpha))[0][0]], - "bo-", - "LineWidth", - 3, - ) - else: - plt.semilogy(self._alphas[0, :], py_mean, "bo-", "LineWidth", 3) - - plt.xlabel("\u03B1") - plt.ylabel("Mean inseparability prob.", fontsize=16) - plt.title("Theor.curves for n=%i:%i" % (n_min, n_max)) - plt.show() - - if len(n_single) > 1: - print( - "FisherS selected several dimensions as equally probable. Taking the maximum" - ) - - return ( - n_alpha, - n_single.max(), - p_alpha, - self._alphas, - separable_fraction, - Xp, - ) +# +# BSD 3-Clause License +# +# Copyright (c) 2020, Jonathan Bac +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +from sklearn.utils.validation import check_array +import numba as nb +import numpy as np +import sklearn.decomposition as sk +from scipy.special import lambertw +from matplotlib import pyplot as plt +from .._commonfuncs import GlobalEstimator +import warnings + +warnings.filterwarnings("ignore") + + +class FisherS(GlobalEstimator): + """Intrinsic dimension estimation using the Fisher Separability algorithm. [Albergante2019]_ + + Parameters + ---------- + conditional_number: float, default=10 + A positive real value used to select the top principal components. We consider only PCs with eigen values + which are not less than the maximal eigenvalue divided by conditional_number + project_on_sphere: bool, default=True + A boolean value indicating if projecting on a sphere should be performed. + test_alphas: 2D np.array with dtype float + A row vector of floats, with alpha range, the values must be given increasing + within (0,1) interval. Default is np.arange(.6,1,.02)[None]. + produce_plots: bool, default=False + A boolean value indicating if the standard plots need to be drawn. + verbose: bool + Whether to print the number of retained principal components + limit_maxdim: bool + Whether to cap estimated maxdim to the embedding dimension + """ + + def __init__( + self, + conditional_number=10, + project_on_sphere=1, + alphas=None, + produce_plots=False, + verbose=0, + limit_maxdim=False, + ): + + self.conditional_number = conditional_number + self.project_on_sphere = project_on_sphere + self.alphas = alphas + self.produce_plots = produce_plots + self.verbose = verbose + self.limit_maxdim = limit_maxdim + + def fit(self, X, y=None): + """ + A reference implementation of a fitting function. + + Parameters + ---------- + X : {array-like}, shape (n_samples, n_features) + The training input samples. + y : dummy parameter to respect the sklearn API + + Returns + ------- + self : object + Returns self. + self.dimension_: float + The estimated intrinsic dimension + self.n_alpha : 1D np.array, float + Effective dimension profile as a function of alpha + self.n_single : float + A single estimate for the effective dimension + self.p_alpha : 2D np.array, float + Distributions as a function of alpha, matrix with columns corresponding to the alpha values, and with rows corresponding to objects. 
+ self.separable_fraction : 1D np.array, float + Separable fraction of data points as a function of alpha + self.alphas : 2D np.array, float + Input alpha values + """ + + X = check_array(X, ensure_min_samples=2, ensure_min_features=2) + + # test_alphas introduced to pass sklearn checks (sklearn doesn't take arrays as default parameters) + if self.alphas is None: + self._alphas = np.arange(0.6, 1, 0.02)[None] + else: + self._alphas = self.alphas + + ( + self.n_alpha_, + self.dimension_, + self.p_alpha_, + self.alphas_, + self.separable_fraction_, + self.Xp_, + ) = self._SeparabilityAnalysis(X) + + self.is_fitted_ = True + # `fit` should always return `self` + return self + + @staticmethod + @nb.njit + def _histc(X, bins): + map_to_bins = np.digitize(X, bins) + r = np.zeros((len(X[0, :]), len(bins))) + for j in range(len(map_to_bins[0, :])): + for i in map_to_bins[:, j]: + r[j, i - 1] += 1 + return r + + def _preprocessing(self, X, center, dimred, whiten): + """ + Preprocessing of the dataset + + Inputs + X is n-by-d data matrix with n d-dimensional datapoints. + center is boolean. True means subtraction of mean vector. + dimred is boolean. True means applying of dimensionality reduction with + PCA. Number of used PCs is defined by conditional_number argument. + whiten is boolean. True means applying of whitenning. True whiten + automatically caused true dimred. + project_on_sphere is boolean. True means projecting data onto unit sphere + + Outputs + X is preprocessed data matrix. + """ + + # centering + sampleMean = np.mean(X, axis=0) + if center: + X = X - sampleMean + # dimensionality reduction if requested dimensionality reduction or whitening + if dimred or whiten: + pca = sk.PCA() + u = pca.fit_transform(X) + v = pca.components_.T + s = pca.explained_variance_ + sc = s / s[0] + ind = np.where(sc > 1 / self.conditional_number)[0] + X = X @ v[:, ind] + if self.verbose: + print( + "%i components are retained using conditional_number=%2.2f" + % (len(ind), self.conditional_number) + ) + + # whitening + if whiten: + X = u[:, ind] + st = np.std(X, axis=0, ddof=1) + X = X / st + # #project on sphere (scale each vector to unit length) + if self.project_on_sphere: + st = np.sqrt(np.sum(X ** 2, axis=1)) + st = np.array([st]).T + X = X / st + + return X + + @staticmethod + def _probability_inseparable_sphere(alphas, n): + """ + %probability_inseparable_sphere calculate theoretical probability for point + %to be inseparable for dimension n + % + %Inputs: + % alphas is 1-by-d vector of possible alphas. Must be row vector or scalar + % n is c-by-1 vector of dimnesions. Must be column vector or scalar. + % + %Outputs: + % p is c-by-d matrix of probabilities.""" + p = np.power((1 - np.power(alphas, 2)), (n - 1) / 2) / ( + alphas * np.sqrt(2 * np.pi * n) + ) + return p + + # def _checkSeparability(self, xy): + # dxy = np.diag(xy) + # sm = (xy/dxy).T + # sm = sm - np.diag(np.diag(sm)) + # sm = sm > self._alphas + # py = sum(sm.T) + # py = py/len(py[0, :]) + # separ_fraction = sum(py == 0)/len(py[0, :]) + # + # return separ_fraction, py + + def _checkSeparabilityMultipleAlpha(self, data): + """%checkSeparabilityMultipleAlpha calculate fraction of points inseparable + %for each alpha and fraction of points which are inseparable from each + %point for different alpha. + % + %Inputs: + % data is data matrix to calculate separability. Each row contains one + % data point. + % alphas is array of alphas to test separability. 
+ % + %Outputs: + % separ_fraction fraction of points inseparable from at least one point. + % Fraction is calculated for each alpha. + % py is n-by-m matrix. py(i,j) is fraction of points which are + % inseparable from point data(i, :) for alphas(j).""" + + # Number of points per 1 loop. 20k assumes approx 3.2GB + nP = 2000 + + alphas = self._alphas + # Normalize alphas + if len(alphas[:, 0]) > 1: + alphas = alphas.T + addedone = 0 + if max(self._alphas[0, :]) < 1: + alphas = np.array([np.append(alphas, 1)]) + addedone = 1 + + alphas = np.concatenate([[float("-inf")], alphas[0, :], [float("inf")]]) + + n = len(data) + counts = np.zeros((n, len(alphas))) + leng = np.zeros((n, 1)) + for k in range(0, n, nP): + # print('Chunk +{}'.format(k)) + e = k + nP + if e > n: + e = n + # Calculate diagonal part, divide each row by diagonal element + xy = data[k:e, :] @ data[k:e, :].T + leng[k:e] = np.diag(xy)[:, None] + xy = xy - np.diag(leng[k:e].squeeze()) + xy = xy / leng[k:e] + counts[k:e, :] = counts[k:e, :] + self._histc(xy.T, alphas) + # Calculate nondiagonal part + for kk in range(0, n, nP): + # Ignore diagonal part + if k == kk: + continue + ee = kk + nP + if ee > n: + ee = n + + xy = data[k:e, :] @ data[kk:ee, :].T + xy = xy / leng[k:e] + counts[k:e, :] = counts[k:e, :] + self._histc(xy.T, alphas) + + # Calculate cumulative sum + counts = np.cumsum(counts[:, ::-1], axis=1)[:, ::-1] + + # print(counts) + + py = counts / (n) + py = py.T + if addedone: + py = py[1:-2, :] + else: + py = py[1:-1, :] + + separ_fraction = sum(py == 0) / len(py[0, :]) + + return separ_fraction, py + + def _dimension_uniform_sphere(self, py): + """ + %Gives an estimation of the dimension of uniformly sampled n-sphere + %corresponding to the average probability of being inseparable and a margin + %value + % + %Inputs: + % py - average fraction of data points which are INseparable. + % alphas - set of values (margins), must be in the range (0;1) + % It is assumed that the length of py and alpha vectors must be of the + % same. + % + %Outputs: + % n - effective dimension profile as a function of alpha + % n_single_estimate - a single estimate for the effective dimension + % alfa_single_estimate is alpha for n_single_estimate. + """ + + if len(py) != len(self._alphas[0, :]): + raise ValueError( + "length of py (%i) and alpha (%i) does not match" + % (len(py), len(self._alphas[0, :])) + ) + + if np.sum(self._alphas <= 0) > 0 or np.sum(self._alphas >= 1) > 0: + raise ValueError( + [ + '"Alphas" must be a real vector, with alpha range, the values must be within (0,1) interval' + ] + ) + + # Calculate dimension for each alpha + n = np.zeros((len(self._alphas[0, :]))) + for i in range(len(self._alphas[0, :])): + if py[i] == 0: + # All points are separable. 
Nothing to do and not interesting + n[i] = np.nan + else: + p = py[i] + a2 = self._alphas[0, i] ** 2 + w = np.log(1 - a2) + n[i] = np.real(lambertw(-(w / (2 * np.pi * p * p * a2 * (1 - a2))))) / ( + -w + ) + + n[n == np.inf] = float("nan") + # Find indices of alphas which are not completely separable + inds = np.where(~np.isnan(n))[0] + if len(inds) == 0: + warnings.warn("All points are fully separable for any of the chosen alphas") + return n, np.array([np.nan]), np.nan + + # Find the maximal value of such alpha + alpha_max = max(self._alphas[0, inds]) + # The reference alpha is the closest to 90 of maximal partially separable alpha + alpha_ref = alpha_max * 0.9 + k = np.where( + abs(self._alphas[0, inds] - alpha_ref) + == min(abs(self._alphas[0, :] - alpha_ref)) + )[0] + # Get corresponding values + alfa_single_estimate = self._alphas[0, inds[k]] + n_single_estimate = n[inds[k]] + + return n, n_single_estimate, alfa_single_estimate + + # def _dimension_uniform_sphere_robust(self, py): + # '''modification to return selected index and handle the case where all values are 0''' + # if len(py) != len(self._alphas[0, :]): + # raise ValueError('length of py (%i) and alpha (%i) does not match' % ( + # len(py), len(self._alphas[0, :]))) + # + # if np.sum(self._alphas <= 0) > 0 or np.sum(self._alphas >= 1) > 0: + # raise ValueError( + # ['"Alphas" must be a real vector, with alpha range, the values must be within (0,1) interval']) + # + # # Calculate dimension for each alpha + # n = np.zeros((len(self._alphas[0, :]))) + # for i in range(len(self._alphas[0, :])): + # if py[i] == 0: + # # All points are separable. Nothing to do and not interesting + # n[i] = np.nan + # else: + # p = py[i] + # a2 = self._alphas[0, i]**2 + # w = np.log(1-a2) + # n[i] = lambertw(-(w/(2*np.pi*p*p*a2*(1-a2))))/(-w) + # + # n[n == np.inf] = float('nan') + # # Find indices of alphas which are not completely separable + # inds = np.where(~np.isnan(n))[0] + # if inds.size == 0: + # n_single_estimate = np.nan + # alfa_single_estimate = np.nan + # return n, n_single_estimate, alfa_single_estimate + # else: + # # Find the maximal value of such alpha + # alpha_max = max(self._alphas[0, inds]) + # # The reference alpha is the closest to 90 of maximal partially separable alpha + # alpha_ref = alpha_max*0.9 + # k = np.where(abs(self._alphas[0, inds]-alpha_ref) + # == min(abs(self._alphas[0, :]-alpha_ref)))[0] + # # Get corresponding values + # alfa_single_estimate = self._alphas[0, inds[k]] + # n_single_estimate = n[inds[k]] + # + # return n, n_single_estimate, alfa_single_estimate, inds[k] + + def point_inseparability_to_pointID( + self, idx="all_inseparable", force_definite_dim=True, verbose=True + ): + """ + Turn pointwise inseparability probability into pointwise global ID + Inputs : + args : same as SeparabilityAnalysis + kwargs : + idx : int, string + int for custom alpha index + 'all_inseparable' to choose alpha where lal points have non-zero inseparability probability + 'selected' to keep global alpha selected + force_definite_dim : bool + whether to force fully separable points to take the minimum detectable inseparability value (1/(n-1)) (i.e., maximal detectable dimension) + """ + if idx == "all_inseparable": # all points are inseparable + selected_idx = np.argwhere(np.all(self.p_alpha_ != 0, axis=1)).max() + elif idx == "selected": # globally selected alpha + selected_idx = (self.n_alpha_ == self.dimension_).tolist().index(True) + elif type(idx) == int: + selected_idx = idx + else: + raise ValueError("unknown idx 
parameter") + + # select palpha and corresponding alpha + palpha_selected = self.p_alpha_[selected_idx, :] + alpha_selected = self._alphas[0, selected_idx] + + py = palpha_selected.copy() + _alphas = np.repeat(alpha_selected, len(palpha_selected))[None] + + if force_definite_dim: + py[py == 0] = 1 / len(py) + + if len(py) != len(_alphas[0, :]): + raise ValueError( + "length of py (%i) and alpha (%i) does not match" + % (len(py), len(_alphas[0, :])) + ) + + if np.sum(_alphas <= 0) > 0 or np.sum(_alphas >= 1) > 0: + raise ValueError( + [ + '"Alphas" must be a real vector, with alpha range, the values must be within (0,1) interval' + ] + ) + + # Calculate dimension for each alpha + n = np.zeros((len(_alphas[0, :]))) + for i in range(len(_alphas[0, :])): + if py[i] == 0: + # All points are separable. Nothing to do and not interesting + n[i] = np.nan + else: + p = py[i] + a2 = _alphas[0, i] ** 2 + w = np.log(1 - a2) + n[i] = np.real(lambertw(-(w / (2 * np.pi * p * p * a2 * (1 - a2))))) / ( + -w + ) + + n[n == np.inf] = float("nan") + + # Find indices of alphas which are not completely separable + inds = np.where(~np.isnan(n))[0] + if self.verbose: + print( + str(len(inds)) + "/" + str(len(py)), + "points have nonzero inseparability probability for chosen alpha = " + + str(round(alpha_selected, 2)) + + f", force_definite_dim = {force_definite_dim}", + ) + return n, inds + + def getSeparabilityGraph(self, idx="all_inseparable", top_edges=10000): + data = self.Xp_ + if idx == "all_inseparable": # all points are inseparable + selected_idx = np.argwhere(np.all(self.p_alpha_ != 0, axis=1)).max() + elif idx == "selected": # globally selected alpha + selected_idx = (self.n_alpha_ == self.dimension_).tolist().index(True) + elif type(idx) == int: + selected_idx = idx + else: + raise ValueError("unknown idx parameter") + alpha_selected = self._alphas[0, selected_idx] + return self.buildSeparabilityGraph(data, alpha_selected, top_edges=top_edges) + + @staticmethod + def plotSeparabilityGraph(x, y, edges, alpha=0.3): + for i in range(len(edges)): + ii = edges[i][0] + jj = edges[i][1] + plt.plot([x[ii], x[jj]], [y[ii], y[jj]], "k-", alpha=alpha) + + @staticmethod + def buildSeparabilityGraph(data, alpha, top_edges=10000): + """weighted directed separability graph, represented by a list of tuples (point i, point j) and an array of weights + each tuple is the observation that point i is inseparable from j, the weight is /-alpha + + data is a matrix of data which is assumed to be properly normalized + alpha parameter is a signle value in this case + top_edges is the number of edges to return. if top_edges is negative then all edges will be returned + """ + + # Number of points per 1 loop. 
20k assumes approx 3.2GB + nP = np.min([2000, len(data)]) + n = len(data) + leng = np.zeros((n, 1)) + + # globalxy = np.zeros((n,n)) + + insep_edges = [] + weights = [] + symmetric_graph = True + symmetry_message = False + + for k in range(0, n, nP): + e = k + nP + if e > n: + e = n + # Calculate diagonal part, divide each row by diagonal element + xy = data[k:e, :] @ data[k:e, :].T + leng[k:e] = np.diag(xy)[:, None] + xy = xy - np.diag(leng[k:e].squeeze()) + xy = xy / leng[k:e] + # if skdim.lid.FisherS.check_symmetric(xy): + if np.allclose(xy, xy.T, rtol=1e-05, atol=1e-08): + # globalxy[k:e,k:e] = np.triu(xy) + for i in range(len(xy)): + for j in range(i + 1, len(xy)): + if xy[i, j] > alpha: + insep_edges.append((k + i, k + j)) + weights.append(xy[i, j] - alpha) + else: + symmetric_graph = False + # globalxy[k:e,k:e] = xy + for i in range(len(xy)): + for j in range(i + 1, len(xy)): + if xy[i, j] > alpha: + insep_edges.append((k + i, k + j)) + weights.append(xy[i, j] - alpha) + # Calculate nondiagonal part + startpoint = 0 + if symmetric_graph: + startpoint = k + if not symmetry_message: + print( + "Graph is symmetric, only upper triangle of the separability matrix will be used" + ) + symmetry_message = True + for kk in range(startpoint, n, nP): + # Ignore diagonal part + if not k == kk: + ee = kk + nP + if ee > n: + ee = n + xy = data[k:e, :] @ data[kk:ee, :].T + xy = xy / leng[k:e] + # globalxy[k:e,kk:ee] = xy + for i in range(ee - kk): + for j in range(ee - kk): + if xy[i, j] > alpha: + insep_edges.append((k + i, kk + j)) + weights.append(xy[i, j] - alpha) + + weights = np.array(weights) + + if top_edges > 0: + if top_edges < len(insep_edges): + weights_sorted = np.sort(weights) + weights_sorted = weights_sorted[::-1] + thresh = weights_sorted[top_edges] + insep_edges_filtered = [] + weights_filtered = [] + for i, w in enumerate(weights): + if w > thresh: + insep_edges_filtered.append(insep_edges[i]) + weights_filtered.append(w) + weights = np.array(weights_filtered) + insep_edges = insep_edges_filtered + + return insep_edges, weights + + @staticmethod + def check_symmetric(a, rtol=1e-05, atol=1e-08): + return np.allclose(a, a.T, rtol=rtol, atol=atol) + + def _SeparabilityAnalysis(self, X): + """ + %Performs standard analysis of separability and produces standard plots. + % + %Inputs: + % X - is a data matrix with one data point in each row. + % Optional arguments in varargin form Name, Value pairs. Possible names: + % 'conditional_number' - a positive real value used to select the top + % princinpal components. We consider only PCs with eigen values + % which are not less than the maximal eigenvalue divided by + % conditional_number Default value is 10. + % 'project_on_sphere' - a boolean value indicating if projecting on a + % sphere should be performed. Default value is true. + % 'Alphas' - a real vector, with alpha range, the values must be given increasing + % within (0,1) interval. Default is [0.6,0.62,...,0.98]. + % 'produce_plots' - a boolean value indicating if the standard plots + % need to be drawn. Default is true. + % 'verbose' - bool, whether to print number of retained principal components + % 'limit_maxdim' bool, whether to cap estimated maxdim to the embedding dimension + %Outputs: + % n_alpha - effective dimension profile as a function of alpha + % n_single - a single estimate for the effective dimension + % p_alpha - distributions as a function of alpha, matrix with columns + % corresponding to the alpha values, and with rows corresponding to + % objects. 
+ % separable_fraction - separable fraction of data points as a function of + % alpha + % alphas - alpha values + """ + npoints = len(X[:, 0]) + # Preprocess data + Xp = self._preprocessing(X, 1, 1, 1) + # Check separability + separable_fraction, p_alpha = self._checkSeparabilityMultipleAlpha(Xp) + # Calculate mean fraction of separable points for each alpha. + py_mean = np.mean(p_alpha, axis=1) + n_alpha, n_single, alpha_single = self._dimension_uniform_sphere(py_mean) + + alpha_ind_selected = np.where(n_single == n_alpha)[0] + + if self.limit_maxdim: + n_single = np.clip(n_single, None, X.shape[1]) + + if self.produce_plots: + # Define the minimal and maximal dimensions for theoretical graph with + # two dimensions in each side + n_min = np.floor(min(n_alpha)) - 2 + n_max = np.floor(max(n_alpha) + 0.8) + 2 + if n_min < 1: + n_min = 1 + + ns = np.arange(n_min, n_max + 1) + + plt.figure() + plt.plot(self._alphas[0, :], n_alpha, "ko-") + plt.plot( + self._alphas[0, alpha_ind_selected], n_single, "rx", markersize=16, + ) + plt.xlabel("\u03B1", fontsize=16) + plt.ylabel("Effective dimension", fontsize=16) + locs, labels = plt.xticks() + plt.show() + nbins = int(round(np.floor(npoints / 200))) + + if nbins < 20: + nbins = 20 + + plt.figure() + plt.hist(p_alpha[alpha_ind_selected, :][0], bins=nbins) + plt.xlabel( + "Inseparability prob. for \u03B1=%2.2f" + % (self._alphas[0, alpha_ind_selected]), + fontsize=16, + ) + plt.ylabel("Number of values") + plt.show() + + plt.figure() + plt.xticks(locs, labels) + pteor = np.zeros((len(ns), len(self._alphas[0, :]))) + for k in range(len(ns)): + for j in range(len(self._alphas[0, :])): + pteor[k, j] = self._probability_inseparable_sphere( + self._alphas[0, j], ns[k] + ) + + for i in range(len(pteor[:, 0])): + plt.semilogy(self._alphas[0, :], pteor[i, :], "-", color="r") + plt.xlim(min(self._alphas[0, :]), 1) + if True in np.isnan(n_alpha): + plt.semilogy( + self._alphas[0, : np.where(np.isnan(n_alpha))[0][0]], + py_mean[: np.where(np.isnan(n_alpha))[0][0]], + "bo-", + "LineWidth", + 3, + ) + else: + plt.semilogy(self._alphas[0, :], py_mean, "bo-", "LineWidth", 3) + + plt.xlabel("\u03B1") + plt.ylabel("Mean inseparability prob.", fontsize=16) + plt.title("Theor.curves for n=%i:%i" % (n_min, n_max)) + plt.show() + + if len(n_single) > 1: + print( + "FisherS selected several dimensions as equally probable. Taking the maximum" + ) + + return ( + n_alpha, + n_single.max(), + p_alpha, + self._alphas, + separable_fraction, + Xp, + ) diff --git a/skdim/id/_PCA.py b/skdim/id/_PCA.py index 73ca9eb..8dd5997 100644 --- a/skdim/id/_PCA.py +++ b/skdim/id/_PCA.py @@ -1,275 +1,275 @@ -# -# BSD 3-Clause License -# -# Copyright (c) 2020, Jonathan Bac -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -import numpy as np -from sklearn.decomposition import PCA -from sklearn.utils.validation import check_array -from .._commonfuncs import GlobalEstimator - - -class lPCA(GlobalEstimator): - # SPDX-License-Identifier: MIT, 2017 Kerstin Johnsson [IDJohnsson]_ - """Intrinsic dimension estimation using the PCA algorithm. [Cangelosi2007]_ [Fan2010]_ [Fukunaga2010]_ [IDJohnsson]_ - - Version 'FO' (Fukunaga-Olsen) returns eigenvalues larger than alphaFO times the largest eigenvalue.\n - Version 'Fan' is the method by Fan et al.\n - Version 'maxgap' returns the position of the largest relative gap in the sequence of eigenvalues.\n - Version 'ratio' returns the number of eigenvalues needed to retain at least alphaRatio of the variance.\n - Version 'participation_ratio' returns the number of eigenvalues given by PR=sum(eigenvalues)^2/sum(eigenvalues^2)\n - Version 'Kaiser' returns the number of eigenvalues above average (the average eigenvalue is 1)\n - Version 'broken_stick' returns the number of eigenvalues above corresponding values of the broken stick distribution\n - - Parameters - ---------- - ver: str, default='FO' - Version. Possible values: 'FO', 'Fan', 'maxgap','ratio', 'Kaiser', 'broken_stick'. - alphaRatio: float in (0,1) - Only for ver = 'ratio'. ID is estimated to be the number of principal components needed to retain at least alphaRatio of the variance. - alphaFO: float in (0,1) - Only for ver = 'FO'. An eigenvalue is considered significant if it is larger than alpha times the largest eigenvalue. - alphaFan: float - Only for ver = 'Fan'. The alpha parameter (large gap threshold). - betaFan: float - Only for ver = 'Fan'. The beta parameter (total covariance threshold). - PFan: float - Only for ver = 'Fan'. Total covariance in non-noise. - verbose: bool, default=False - explained_variance: bool, default=False - If True, lPCA.fit(X) expects as input a precomputed explained_variance vector: X = sklearn.decomposition.PCA().fit(X).explained_variance_ - """ - - def __init__( - self, - ver="FO", - alphaRatio=0.05, - alphaFO=0.05, - alphaFan=10, - betaFan=0.8, - PFan=0.95, - verbose=True, - fit_explained_variance=False, - ): - self.ver = ver - self.alphaRatio = alphaRatio - self.alphaFO = alphaFO - self.alphaFan = alphaFan - self.betaFan = betaFan - self.PFan = PFan - self.verbose = verbose - self.fit_explained_variance = fit_explained_variance - - def fit(self, X, y=None): - """A reference implementation of a fitting function. - - Parameters - ---------- - X : {array-like}, shape (n_samples, n_features) - A local dataset of training input samples. - y : dummy parameter to respect the sklearn API - - Returns - ------- - self : object - Returns self. 
- """ - if self.fit_explained_variance: - X = check_array(X, ensure_2d=False, ensure_min_samples=2) - else: - X = check_array(X, ensure_min_samples=2, ensure_min_features=2) - - self.dimension_, self.gap_ = self._pcaLocalDimEst(X) - self.is_fitted_ = True - # `fit` should always return `self` - return self - - def _fit_once(self, X, y=None): - """A reference implementation of a fitting function. - Parameters - ---------- - X : {array-like}, shape (n_samples, n_features) - A local dataset of training input samples. - y : dummy parameter to respect the sklearn API - - Returns - ------- - self : object - Returns self. - """ - if self.fit_explained_variance: - X = check_array(X, ensure_2d=False, ensure_min_samples=2) - else: - X = check_array(X, ensure_min_samples=2, ensure_min_features=2) - - self.dimension_, self.gap_ = self._pcaLocalDimEst(X) - self.is_fitted_ = True - # `fit` should always return `self` - return self - - def _pcaLocalDimEst(self, X): - if self.fit_explained_variance: - explained_var = X - else: - pca = PCA().fit(X) - self.explained_var_ = explained_var = pca.explained_variance_ - - if self.ver == "FO": - return self._FO(explained_var) - elif self.ver == "Fan": - return self._fan(explained_var) - elif self.ver == "maxgap": - return self._maxgap(explained_var) - elif self.ver == "ratio": - return self._ratio(explained_var) - elif self.ver == "participation_ratio": - return self._participation_ratio(explained_var) - elif self.ver == "Kaiser": - return self._Kaiser(explained_var) - elif self.ver == "broken_stick": - return self._broken_stick(explained_var) - - def _FO(self, explained_var): - de = sum(explained_var > (self.alphaFO * explained_var[0])) - gaps = explained_var[:-1] / explained_var[1:] - - if de - 1 < len(gaps): - return de, gaps[de - 1] - else: - return de, gaps[-1] - - @staticmethod - def _maxgap(explained_var): - gaps = explained_var[:-1] / explained_var[1:] - de = np.nanargmax(gaps) + 1 - - if de - 1 < len(gaps): - return de, gaps[de - 1] - else: - return de, gaps[-1] - - def _ratio(self, explained_var): - sumexp = np.cumsum(explained_var) - sumexp_norm = sumexp / np.max(sumexp) - de = sum(sumexp_norm < self.alphaRatio) + 1 - gaps = explained_var[:-1] / explained_var[1:] - if de - 1 < len(gaps): - return de, gaps[de - 1] - else: - return de, gaps[-1] - - def _participation_ratio(self, explained_var): - PR = sum(explained_var) ** 2 / sum(explained_var ** 2) - de = PR - gaps = explained_var[:-1] / explained_var[1:] - if de - 1 < len(gaps): - return de, gaps[de - 1] - else: - return de, gaps[-1] - - def _fan(self, explained_var): - r = np.where(np.cumsum(explained_var) / sum(explained_var) > self.PFan)[0][0] - sigma = np.mean(explained_var[r:]) - explained_var -= sigma - gaps = explained_var[:-1] / explained_var[1:] - de = 1 + np.min( - np.concatenate( - ( - np.where(gaps > self.alphaFan)[0], - np.where( - (np.cumsum(explained_var) / sum(explained_var)) > self.betaFan - )[0], - ) - ) - ) - if de - 1 < len(gaps): - return de, gaps[de - 1] - else: - return de, gaps[-1] - - def _Kaiser(self, explained_var): - de = sum(explained_var > np.mean(explained_var)) - gaps = explained_var[:-1] / explained_var[1:] - - if de - 1 < len(gaps): - return de, gaps[de - 1] - else: - return de, gaps[-1] - - @staticmethod - def _brokenstick_distribution(dim): - distr = np.zeros(dim) - for i in range(dim): - for j in range(i, dim): - distr[i] = distr[i] + 1 / (j + 1) - distr[i] = distr[i] / dim - return distr - - def _broken_stick(self, explained_var): - bs = 
self._brokenstick_distribution(dim=len(explained_var)) - gaps = explained_var[:-1] / explained_var[1:] - de = 0 - explained_var_norm = explained_var / np.sum(explained_var) - for i in range(len(explained_var)): - if bs[i] > explained_var_norm[i]: - de = i + 1 - break - if de - 1 < len(gaps): - return de, gaps[de - 1] - else: - return de, gaps[-1] - - -##### dev in progress -# from sklearn.cluster import KMeans -# def pcaOtpmPointwiseDimEst(data, N, alpha = 0.05): -# km = KMeans(n_clusters=N) -# km.fit(data) -# pt = km.cluster_centers_ -# pt_bm = km.labels_ -# pt_sm = np.repeat(np.nan, len(pt_bm)) -# -# for k in range(len(data)): -# pt_sm[k] = np.argmin(lens(pt[[i for i in range(N) if i!=pt_bm[k]],:] - data[k,:])) -# if (pt_sm[k] >= pt_bm[k]): -# pt_sm[k] += 1 -# -# de_c = np.repeat(np.nan, N) -# nbr_nb_c = np.repeat(np.nan, N) -# for k in range(N): -# nb = np.unique(np.concatenate((pt_sm[pt_bm == k], pt_bm[pt_sm == k]))).astype(int) -# nbr_nb_c[k] = len(nb) -# loc_dat = pt[nb,:] - pt[k,:] -# if len(loc_dat) == 1: -# continue -# de_c[k] = lPCA().fit(loc_dat).dimension_ #pcaLocalDimEst(loc_dat, ver = "FO", alphaFO = alpha) -# -# de = de_c[pt_bm] -# nbr_nb = nbr_nb_c[pt_bm] -# return de, nbr_nb +# +# BSD 3-Clause License +# +# Copyright (c) 2020, Jonathan Bac +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +import numpy as np +from sklearn.decomposition import PCA +from sklearn.utils.validation import check_array +from .._commonfuncs import GlobalEstimator + + +class lPCA(GlobalEstimator): + # SPDX-License-Identifier: MIT, 2017 Kerstin Johnsson [IDJohnsson]_ + """Intrinsic dimension estimation using the PCA algorithm. 
[Cangelosi2007]_ [Fan2010]_ [Fukunaga2010]_ [IDJohnsson]_ + + Version 'FO' (Fukunaga-Olsen) returns eigenvalues larger than alphaFO times the largest eigenvalue.\n + Version 'Fan' is the method by Fan et al.\n + Version 'maxgap' returns the position of the largest relative gap in the sequence of eigenvalues.\n + Version 'ratio' returns the number of eigenvalues needed to retain at least alphaRatio of the variance.\n + Version 'participation_ratio' returns the number of eigenvalues given by PR=sum(eigenvalues)^2/sum(eigenvalues^2)\n + Version 'Kaiser' returns the number of eigenvalues above average (the average eigenvalue is 1)\n + Version 'broken_stick' returns the number of eigenvalues above corresponding values of the broken stick distribution\n + + Parameters + ---------- + ver: str, default='FO' + Version. Possible values: 'FO', 'Fan', 'maxgap','ratio', 'Kaiser', 'broken_stick'. + alphaRatio: float in (0,1) + Only for ver = 'ratio'. ID is estimated to be the number of principal components needed to retain at least alphaRatio of the variance. + alphaFO: float in (0,1) + Only for ver = 'FO'. An eigenvalue is considered significant if it is larger than alpha times the largest eigenvalue. + alphaFan: float + Only for ver = 'Fan'. The alpha parameter (large gap threshold). + betaFan: float + Only for ver = 'Fan'. The beta parameter (total covariance threshold). + PFan: float + Only for ver = 'Fan'. Total covariance in non-noise. + verbose: bool, default=False + explained_variance: bool, default=False + If True, lPCA.fit(X) expects as input a precomputed explained_variance vector: X = sklearn.decomposition.PCA().fit(X).explained_variance_ + """ + + def __init__( + self, + ver="FO", + alphaRatio=0.05, + alphaFO=0.05, + alphaFan=10, + betaFan=0.8, + PFan=0.95, + verbose=True, + fit_explained_variance=False, + ): + self.ver = ver + self.alphaRatio = alphaRatio + self.alphaFO = alphaFO + self.alphaFan = alphaFan + self.betaFan = betaFan + self.PFan = PFan + self.verbose = verbose + self.fit_explained_variance = fit_explained_variance + + def fit(self, X, y=None): + """A reference implementation of a fitting function. + + Parameters + ---------- + X : {array-like}, shape (n_samples, n_features) + A local dataset of training input samples. + y : dummy parameter to respect the sklearn API + + Returns + ------- + self : object + Returns self. + """ + if self.fit_explained_variance: + X = check_array(X, ensure_2d=False, ensure_min_samples=2) + else: + X = check_array(X, ensure_min_samples=2, ensure_min_features=2) + + self.dimension_, self.gap_ = self._pcaLocalDimEst(X) + self.is_fitted_ = True + # `fit` should always return `self` + return self + + def _fit_once(self, X, y=None): + """A reference implementation of a fitting function. + Parameters + ---------- + X : {array-like}, shape (n_samples, n_features) + A local dataset of training input samples. + y : dummy parameter to respect the sklearn API + + Returns + ------- + self : object + Returns self. 
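As a hedged illustration of the versions listed above, the sketch below runs the estimator on random data and also reuses a precomputed eigenvalue spectrum through `fit_explained_variance`; only parameters defined in this class are assumed, and the data is purely illustrative.

```python
import numpy as np
from sklearn.decomposition import PCA
import skdim

X = np.random.default_rng(0).normal(size=(500, 10))  # illustrative data

# Fukunaga-Olsen thresholding on the raw data
d_fo = skdim.id.lPCA(ver="FO").fit(X).dimension_

# reuse an already computed spectrum instead of refitting PCA internally
ev = PCA().fit(X).explained_variance_
d_ratio = skdim.id.lPCA(ver="ratio", fit_explained_variance=True).fit(ev).dimension_
print(d_fo, d_ratio)
```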
+ """ + if self.fit_explained_variance: + X = check_array(X, ensure_2d=False, ensure_min_samples=2) + else: + X = check_array(X, ensure_min_samples=2, ensure_min_features=2) + + self.dimension_, self.gap_ = self._pcaLocalDimEst(X) + self.is_fitted_ = True + # `fit` should always return `self` + return self + + def _pcaLocalDimEst(self, X): + if self.fit_explained_variance: + explained_var = X + else: + pca = PCA().fit(X) + self.explained_var_ = explained_var = pca.explained_variance_ + + if self.ver == "FO": + return self._FO(explained_var) + elif self.ver == "Fan": + return self._fan(explained_var) + elif self.ver == "maxgap": + return self._maxgap(explained_var) + elif self.ver == "ratio": + return self._ratio(explained_var) + elif self.ver == "participation_ratio": + return self._participation_ratio(explained_var) + elif self.ver == "Kaiser": + return self._Kaiser(explained_var) + elif self.ver == "broken_stick": + return self._broken_stick(explained_var) + + def _FO(self, explained_var): + de = sum(explained_var > (self.alphaFO * explained_var[0])) + gaps = explained_var[:-1] / explained_var[1:] + + if de - 1 < len(gaps): + return de, gaps[de - 1] + else: + return de, gaps[-1] + + @staticmethod + def _maxgap(explained_var): + gaps = explained_var[:-1] / explained_var[1:] + de = np.nanargmax(gaps) + 1 + + if de - 1 < len(gaps): + return de, gaps[de - 1] + else: + return de, gaps[-1] + + def _ratio(self, explained_var): + sumexp = np.cumsum(explained_var) + sumexp_norm = sumexp / np.max(sumexp) + de = sum(sumexp_norm < self.alphaRatio) + 1 + gaps = explained_var[:-1] / explained_var[1:] + if de - 1 < len(gaps): + return de, gaps[de - 1] + else: + return de, gaps[-1] + + def _participation_ratio(self, explained_var): + PR = sum(explained_var) ** 2 / sum(explained_var ** 2) + de = PR + gaps = explained_var[:-1] / explained_var[1:] + if de - 1 < len(gaps): + return de, gaps[de - 1] + else: + return de, gaps[-1] + + def _fan(self, explained_var): + r = np.where(np.cumsum(explained_var) / sum(explained_var) > self.PFan)[0][0] + sigma = np.mean(explained_var[r:]) + explained_var -= sigma + gaps = explained_var[:-1] / explained_var[1:] + de = 1 + np.min( + np.concatenate( + ( + np.where(gaps > self.alphaFan)[0], + np.where( + (np.cumsum(explained_var) / sum(explained_var)) > self.betaFan + )[0], + ) + ) + ) + if de - 1 < len(gaps): + return de, gaps[de - 1] + else: + return de, gaps[-1] + + def _Kaiser(self, explained_var): + de = sum(explained_var > np.mean(explained_var)) + gaps = explained_var[:-1] / explained_var[1:] + + if de - 1 < len(gaps): + return de, gaps[de - 1] + else: + return de, gaps[-1] + + @staticmethod + def _brokenstick_distribution(dim): + distr = np.zeros(dim) + for i in range(dim): + for j in range(i, dim): + distr[i] = distr[i] + 1 / (j + 1) + distr[i] = distr[i] / dim + return distr + + def _broken_stick(self, explained_var): + bs = self._brokenstick_distribution(dim=len(explained_var)) + gaps = explained_var[:-1] / explained_var[1:] + de = 0 + explained_var_norm = explained_var / np.sum(explained_var) + for i in range(len(explained_var)): + if bs[i] > explained_var_norm[i]: + de = i + 1 + break + if de - 1 < len(gaps): + return de, gaps[de - 1] + else: + return de, gaps[-1] + + +##### dev in progress +# from sklearn.cluster import KMeans +# def pcaOtpmPointwiseDimEst(data, N, alpha = 0.05): +# km = KMeans(n_clusters=N) +# km.fit(data) +# pt = km.cluster_centers_ +# pt_bm = km.labels_ +# pt_sm = np.repeat(np.nan, len(pt_bm)) +# +# for k in range(len(data)): +# pt_sm[k] = 
np.argmin(lens(pt[[i for i in range(N) if i!=pt_bm[k]],:] - data[k,:])) +# if (pt_sm[k] >= pt_bm[k]): +# pt_sm[k] += 1 +# +# de_c = np.repeat(np.nan, N) +# nbr_nb_c = np.repeat(np.nan, N) +# for k in range(N): +# nb = np.unique(np.concatenate((pt_sm[pt_bm == k], pt_bm[pt_sm == k]))).astype(int) +# nbr_nb_c[k] = len(nb) +# loc_dat = pt[nb,:] - pt[k,:] +# if len(loc_dat) == 1: +# continue +# de_c[k] = lPCA().fit(loc_dat).dimension_ #pcaLocalDimEst(loc_dat, ver = "FO", alphaFO = alpha) +# +# de = de_c[pt_bm] +# nbr_nb = nbr_nb_c[pt_bm] +# return de, nbr_nb From 8211d35dba0f157ce0ca9cd3d6ca8deaefd0a2bf Mon Sep 17 00:00:00 2001 From: j-bac Date: Mon, 30 May 2022 15:46:06 +0200 Subject: [PATCH 2/4] cluster upd --- skdim/cluster.py | 251 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 214 insertions(+), 37 deletions(-) diff --git a/skdim/cluster.py b/skdim/cluster.py index 0c1f7d1..db290a5 100644 --- a/skdim/cluster.py +++ b/skdim/cluster.py @@ -3,9 +3,25 @@ from scipy.cluster.hierarchy import dendrogram import sklearn.cluster from sklearn.base import ClusterMixin, BaseEstimator +from scipy.special import softmax + + +def __dir__(): + return ["getLinkage", "dendrogram", "AgglomerativeClustering", "CutPursuit"] + + +def CV(x, thresh=0.5): + return max(0, np.std(x) - thresh) / np.mean(x) + + +def gaussw(d, sigma=1, axis=0): + # yML = gaussw( + # np.abs(y.flatten() - np.tile(np.arange(int(np.ceil(np.max(y))))[:,None],len(X))), sigma=1, + # ).copy('F') + """ turn distance d into probabilities""" + return softmax((-(d ** 2)) / (sigma ** 2), axis=axis) + -def __dir__(): return ['getLinkage','dendrogram','AgglomerativeClustering','CutPursuit'] - def getLinkage(model): counts = np.zeros(model.children_.shape[0]) n_samples = len(model.labels_) @@ -25,45 +41,118 @@ def getLinkage(model): class AgglomerativeClustering(sklearn.cluster.AgglomerativeClustering): - def __init__(self, - n_clusters=None, - distance_threshold=None, - affinity="euclidean", - memory=None, - connectivity=None, - compute_full_tree='auto', - linkage='ward', - compute_distances=False): - + def __init__( + self, + n_clusters=None, + distance_threshold=None, + affinity="euclidean", + memory=None, + connectivity=None, + compute_full_tree="auto", + linkage="ward", + compute_distances=False, + ): + kwargs = inspect.getargvalues(inspect.currentframe())[-1] - kwargs.pop("self") + kwargs.pop("self") super().__init__() - #super().set_params(**{k:kwargs[k] for k in super().get_params()}) + # super().set_params(**{k:kwargs[k] for k in super().get_params()}) for arg, val in kwargs.items(): setattr(self, arg, val) - - def fit(self,lid): - + + def fit(self, lid): + if len(lid.shape) == 1: - lid = lid.reshape(-1,1) - if not(self.affinity in ['euclidean', 'l1', 'l2']): + lid = lid.reshape(-1, 1) + if not (self.affinity in ["euclidean", "l1", "l2"]): raise ValueError("affinity must be one of 'euclidean', 'l1', 'l2'") if self.connectivity is None: - print("WARNING: connectivity matrix is None."+ - " Clustering will not take spatial relationships into account") + print( + "WARNING: connectivity matrix is None." 
+ + " Clustering will not take spatial relationships into account" + ) if (self.n_clusters is None) and (self.distance_threshold is None): - self.distance_threshold = np.around(np.std(lid),2) - print(f"n_clusters and distance_threshold are both None."+ - " Setting distance_threshold as std(lid) =",round(self.distance_threshold,2)) - + self.distance_threshold = np.around(np.std(lid), 2) + print( + f"n_clusters and distance_threshold are both None." + + " Setting distance_threshold as std(lid) =", + round(self.distance_threshold, 2), + ) + super().fit(lid) - - self.comp_mean_lid = np.array([np.mean(lid[self.labels_==c]) for c in np.unique(self.labels_)]) - return self -class CutPursuit(ClusterMixin,BaseEstimator): - #def __new__(cls): - # try: + self.comp_mean_lid = np.array( + [np.mean(lid[self.labels_ == c]) for c in np.unique(self.labels_)] + ) + return self + + +class CutPursuit(ClusterMixin, BaseEstimator): + """Cut-pursuit algorithms. Minimize functionals over a graph G = (V, E) + + connectivity: scipy sparse CSR matrix + Graph adjacency matrix. Non-zero values will be used as edge weights if edge_weights='adj'. + mode: str, default='l0' + Which cut-pursuit algorithm to use: + 'l0': l0 (weighted contour length) penalization, with a separable loss term. F(x) = f(x) + ||x||_l0 + 'l1_lsx': l1 (weighted total variation) penalization, with a separable loss term and simplex constraints. F(x) = f(x) + ||x||_l1 + i_{simplex}(x) + 'l1_gtv': l1 (weighted total variation) penalization, with quadratic loss term. F(x) = 1/2 ||y - x||^2 + ||x||_l1 + loss: int or float + 0 for linear + 1 for quadratic + 0 < loss < 1 for smoothed Kullback-Leibler + edge_weights: str or float + "conn" to take weights from connectivity matrix + scalar for homogeneous weights + vert_weights: (real) array of length V or empty for no weights + Weights on vertices. 
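A hedged usage sketch of the `AgglomerativeClustering` wrapper defined above: it groups pointwise intrinsic-dimension estimates into clusters of similar local dimension. It assumes the new module is importable as `skdim.cluster` and uses random Gaussian data purely for illustration; with both `n_clusters` and `distance_threshold` left at `None`, the threshold falls back to `std(lid)` as coded above.

```python
import numpy as np
import skdim

X = np.random.default_rng(0).normal(size=(300, 5))             # illustrative data
lid = skdim.id.lPCA().fit_pw(X, n_neighbors=50).dimension_pw_  # pointwise ID estimates

clus = skdim.cluster.AgglomerativeClustering().fit(lid)
print(np.unique(clus.labels_))  # cluster labels per point
print(clus.comp_mean_lid)       # mean local ID of each cluster
```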
+ for multidimensional data, weighs the coordinates in the + l1 norms of finite differences; all weights must be strictly positive, + it is advised to normalize the weights so that the first value is unity + cp_dif_tol: float + stopping criterion on iterate evolution; algorithm stops if + relative changes (in Euclidean norm) is less than dif_tol; + 1e-3 is a typical value; a lower one can give better precision + but with longer computational time and more final components + cp_it_max: int + maximum number of iterations (graph cut and subproblem) + 10 cuts solve accurately most problems + max_num_threads: if greater than zero, set the maximum number of threads + used for parallelization with OpenMP + balance_parallel_split: if true, the parallel workload of the split step + is balanced; WARNING: this might trade off speed against optimality + l0_solver: dict + Parameters for the l0 solver + K: number of alternative values considered in the split step + split_iter_num: number of partition-and-update iterations in the split + step + split_damp_ratio: edge weights damping for favoring splitting; edge + weights increase in linear progression along partition-and-update + iterations, from this ratio up to original value; real scalar between 0 + and 1, the latter meaning no damping + kmpp_init_num: number of random k-means initializations in the split step + kmpp_iter_num: number of k-means iterations in the split step + l1_solver: dict + Parameters for the l1 solver + pfdr_rho: relaxation parameter, 0 < rho < 2 + 1 is a conservative value; 1.5 often speeds up convergence + pfdr_cond_min: stability of preconditioning; 0 < cond_min < 1; + corresponds roughly the minimum ratio to the maximum descent metric; + 1e-2 is typical; a smaller value might enhance preconditioning + pfdr_dif_rcd: reconditioning criterion on iterate evolution; + a reconditioning is performed if relative changes of the iterate drops + below dif_rcd; WARNING: reconditioning might temporarily draw minimizer + away from the solution set and give bad subproblem solutions + pfdr_dif_tol: stopping criterion on iterate evolution; algorithm stops if + relative changes (in Euclidean norm) is less than dif_tol + 1e-2*cp_dif_tol is a conservative value + pfdr_it_max: maximum number of iterations + 1e4 iterations provides enough precision for most subproblems + + """ + + # def __new__(cls): + # try: # cls.cutpursuit = __import__('cutpursuit') # return super(CutPursuit,cls).__new__(cls) # except ModuleNotFoundError as err: @@ -72,11 +161,99 @@ class CutPursuit(ClusterMixin,BaseEstimator): # 'pip install git+https://github.com/j-bac/parallel-cut-pursuit') # raise - def __init__(self): - try: - self.cutpursuit = __import__('cutpursuit') + def _check_package(self): + try: + self.cutpursuit = __import__("cutpursuit") except ModuleNotFoundError as err: - print(f'To use this class, install cutpursuit with: \n', - 'conda install gxx-compiler -y \n', - 'pip install git+https://github.com/j-bac/parallel-cut-pursuit') + print( + f"To use this class, install cutpursuit with: \n", + "conda install gxx-compiler -y \n", + "pip install git+https://github.com/j-bac/parallel-cut-pursuit", + ) raise + + def __init__( + self, + connectivity, + mode="l0", + loss=1, + min_comp_weight=0.0, + edge_weights=1.0, + vert_weights=None, + coor_weights=None, + cp_dif_tol=0.0001, + cp_it_max=10, + max_num_threads=0, + balance_parallel_split=True, + l0_solver=dict( + K=2, + split_iter_num=2, + split_damp_ratio=1.0, + kmpp_init_num=3, + kmpp_iter_num=3, + ), + 
l1_solver=dict( + pfdr_rho=1.0, + pfdr_cond_min=0.01, + pfdr_dif_rcd=0.0, + pfdr_dif_tol=None, + pfdr_it_max=10000, + ), + ): + self._check_package() + kwargs = inspect.getargvalues(inspect.currentframe())[-1] + kwargs.pop("self") + + for arg, val in kwargs.items(): + if arg == "edge_weights": + if arg == "conn": + self.edge_weights = connectivity.data + else: # None or real number + self.edge_weights = edge_weights + else: + setattr(self, arg, val) + + def fit(self, X): + """ + X: observations, (real) D-by-V array + For Kullback-Leibler loss, the value at each vertex must lie on the probability simplex + careful to the internal memory representation of multidimensional + arrays; the C++ implementation uses column-major order (F-contiguous); + usually numpy uses row-major order (C-contiguous), but this can often + be taken care of without actually copying data by using transpose(); + """ + + if self.mode == "l0": + self.components_, self.components_values_ = self.cutpursuit.cp_kmpp_d0_dist( + loss=self.loss, + Y=X, + first_edge=self.connectivity.indptr.astype("uint32"), + adj_vertices=self.connectivity.indices.astype("uint32"), + edge_weights=self.edge_weights, + vert_weights=self.vert_weights, + coor_weights=self.coor_weights, + cp_dif_tol=self.cp_dif_tol, + cp_it_max=self.cp_it_max, + min_comp_weight=self.min_comp_weight, + max_num_threads=self.max_num_threads, + balance_parallel_split=self.balance_parallel_split, + **self.l0_solver, + ) + + if self.mode == "l1": + self.components_, self.components_values_ = self.cutpursuit.cp_prox_tv( + Y=X, + first_edge=self.connectivity.indptr.astype("uint32"), + adj_vertices=self.connectivity.indices.astype("uint32"), + edge_weights=self.edge_weights, + cp_dif_tol=self.cp_dif_tol, + cp_it_max=self.cp_it_max, + max_num_threads=self.max_num_threads, + balance_parallel_split=self.balance_parallel_split, + **self.l1_solver, + ) + + return self + + def get_confidence(self): + pass From 110680afa05af72de7257fefb5da1d1eb2d40376 Mon Sep 17 00:00:00 2001 From: j-bac Date: Sun, 11 Sep 2022 12:31:05 +0200 Subject: [PATCH 3/4] update --- .gitignore | 9 +++++++++ skdim/_commonfuncs.py | 2 +- skdim/id/_PCA.py | 23 ----------------------- 3 files changed, 10 insertions(+), 24 deletions(-) diff --git a/.gitignore b/.gitignore index d5ff153..b908cff 100644 --- a/.gitignore +++ b/.gitignore @@ -67,3 +67,12 @@ doc/generated/ # PyBuilder target/ +.idea/inspectionProfiles/profiles_settings.xml +.gitignore +.idea/other.xml +.gitignore +.gitignore +.gitignore +.idea/modules.xml +.idea/scikit-dimension.iml +.idea/vcs.xml diff --git a/skdim/_commonfuncs.py b/skdim/_commonfuncs.py index 14c9a0e..aeaddab 100644 --- a/skdim/_commonfuncs.py +++ b/skdim/_commonfuncs.py @@ -211,7 +211,7 @@ def fit_pw(self, X, precomputed_knn=None, smooth=False, n_neighbors=100, n_jobs= if smooth: self.dimension_pw_smooth_ = np.zeros(len(knnidx)) for i, point_nn in enumerate(knnidx): - self.dimension_pw_smooth_[i] = np.mean( + self.dimension_pw_smooth_[i] = np.nanmean( np.append(self.dimension_pw_[i], self.dimension_pw_[point_nn]) ) return self diff --git a/skdim/id/_PCA.py b/skdim/id/_PCA.py index 8dd5997..4ec167b 100644 --- a/skdim/id/_PCA.py +++ b/skdim/id/_PCA.py @@ -110,29 +110,6 @@ def fit(self, X, y=None): # `fit` should always return `self` return self - def _fit_once(self, X, y=None): - """A reference implementation of a fitting function. - Parameters - ---------- - X : {array-like}, shape (n_samples, n_features) - A local dataset of training input samples. 
- y : dummy parameter to respect the sklearn API - - Returns - ------- - self : object - Returns self. - """ - if self.fit_explained_variance: - X = check_array(X, ensure_2d=False, ensure_min_samples=2) - else: - X = check_array(X, ensure_min_samples=2, ensure_min_features=2) - - self.dimension_, self.gap_ = self._pcaLocalDimEst(X) - self.is_fitted_ = True - # `fit` should always return `self` - return self - def _pcaLocalDimEst(self, X): if self.fit_explained_variance: explained_var = X From 630ee1bb453e6fe21d86cd9c68e46a08ecb6a3b8 Mon Sep 17 00:00:00 2001 From: j-bac Date: Tue, 25 Oct 2022 17:04:28 +0200 Subject: [PATCH 4/4] subspace --- skdim/_commonfuncs.py | 30 +- skdim/subspace/embedding.py | 52 ++++ skdim/subspace/grassmann_distances.py | 315 +++++++++++++++++++++ skdim/subspace/main.py | 391 ++++++++++++++++++++++++++ skdim/subspace/moments.py | 160 +++++++++++ skdim/subspace/utils.py | 216 ++++++++++++++ 6 files changed, 1143 insertions(+), 21 deletions(-) create mode 100644 skdim/subspace/embedding.py create mode 100644 skdim/subspace/grassmann_distances.py create mode 100644 skdim/subspace/main.py create mode 100644 skdim/subspace/moments.py create mode 100644 skdim/subspace/utils.py diff --git a/skdim/_commonfuncs.py b/skdim/_commonfuncs.py index aeaddab..fd374de 100644 --- a/skdim/_commonfuncs.py +++ b/skdim/_commonfuncs.py @@ -273,30 +273,18 @@ def fit_transform_pw( Smoothed pointwise ID estimates returned if self.fit_pw(smooth=True) """ - X = check_array(X, ensure_min_samples=n_neighbors + 1, ensure_min_features=2) - - if precomputed_knn is not None: - knnidx = precomputed_knn - else: - _, knnidx = get_nn(X, k=n_neighbors, n_jobs=n_jobs) - - if n_jobs > 1: - pool = mp.Pool(n_jobs) - results = pool.map(self.fit, [X[i, :] for i in knnidx]) - pool.close() - dimension_pw_ = np.array([r.dimension_ for r in results]) - else: - dimension_pw_ = np.array([self.fit(X[i, :]).dimension_ for i in knnidx]) + self.fit_pw( + X, + precomputed_knn=precomputed_knn, + smooth=smooth, + n_neighbors=n_neighbors, + n_jobs=n_jobs, + ) if smooth: - dimension_pw_smooth_ = np.zeros(len(knnidx)) - for i, point_nn in enumerate(knnidx): - dimension_pw_smooth_[i] = np.mean( - np.append(dimension_pw_[i], dimension_pw_[point_nn]) - ) - return dimension_pw_, dimension_pw_smooth_ + return self.dimension_pw_, self.dimension_pw_smooth_ else: - return dimension_pw_ + return self.dimension_pw_ class LocalEstimator(BaseEstimator): # , metaclass=DocInheritorBase): diff --git a/skdim/subspace/embedding.py b/skdim/subspace/embedding.py new file mode 100644 index 0000000..26fbad0 --- /dev/null +++ b/skdim/subspace/embedding.py @@ -0,0 +1,52 @@ +import numpy as np +import numba as nb + +@nb.njit +def _triu_to_flat(X): + m = len(X) + flat = [] + for i in range(m): + for j in range(i,m): + flat.append(X[i,j]) + return np.array(flat) + +@nb.njit +def _embed_conway(subspace): + proj = subspace @ subspace.T + proj_triu_flat = _triu_to_flat(proj) + return proj_triu_flat + +def embed_conway(subspace): + m = subspace.shape[0] + d = subspace.shape[1] + radius = np.sqrt(d*(m - d)/m) + dim = int((m-1)*(m+2)/2) + proj = _embed_conway(subspace) + return proj,m,d,radius,dim + +def unembed_conway(proj): + #Given an embedding of a subspace p,return the projection matrix + dim = int((np.sqrt(8*len(proj)+1)-1)/2) + P = np.zeros((dim,dim)) + P[np.triu_indices_from(P)]=proj + P += P.T - np.diag(np.diag(P)) + return P + +@nb.njit +def embed_conway_loop(subspaces,normalize=True): + + m = subspaces[0].shape[0] + n = len(subspaces) + Im = 
np.diag(np.ones(m)/2) + sphere_all_center = _triu_to_flat(Im) + + projs = np.zeros((n,len(sphere_all_center))) + + for i,subspace in enumerate(subspaces): + projs[i] = _embed_conway(subspace) + + projs -= sphere_all_center + if normalize: + projs /= np.sqrt(np.sum(projs**2,axis=1)).reshape(-1,1) + return projs, sphere_all_center + diff --git a/skdim/subspace/grassmann_distances.py b/skdim/subspace/grassmann_distances.py new file mode 100644 index 0000000..071b3a5 --- /dev/null +++ b/skdim/subspace/grassmann_distances.py @@ -0,0 +1,315 @@ +import itertools +import numpy as np +import numba as nb +import pandas as pd +try: + import jax + import jax.numpy as jnp +except: + pass +from functools import partial + +def grassmann_distance_matrix(subspaces,ij=None,metric='geodesic',contrast_unequal=True,gpu=True,max_nbytes = 1e8,max_rank=None,return_angles=True): + + dist_func = DISTANCE_FUNCTIONS[metric] + xp = jnp if gpu else np + + + n = len(subspaces) + length = len(subspaces[0]) + ranks = np.array([subs.shape[1] for subs in subspaces]) + if max_rank is None: + max_rank = int(xp.max(ranks)) + elif max_rank > int(xp.max(ranks)): + raise ValueError('max_rank must be <= to the maximum rank of provided matrices') + else: + ranks = np.clip(ranks,None,max_rank) + dtype_size = subspaces[0].dtype.itemsize + blocksize = int((max_nbytes*dtype_size)/(length*max_rank*dtype_size)) + + # pad input subspaces to concatenate them and vectorize computations + subspaces_pad = np.zeros((n,length,max_rank)) + for r0,rank in enumerate(ranks): + subspaces_pad[r0,:,:rank] = subspaces[r0][:,:rank] + subspaces_pad = xp.array(subspaces_pad) + subspaces_padT = xp.transpose(subspaces_pad, axes=(0,2,1)) + + # loop over blocks of indices + if ij is not None: + i, j = ij[0], ij[1] + else: + i, j = np.triu_indices(n, k=1) + + pdist = np.full(len(i),np.nan) + thetas = [] + + print('Computing distances... 0%',end='\r') + for start in range(0, len(pdist), blocksize): + # Define last element for calculation + end = min(start + blocksize, len(pdist)) + + AT = subspaces_padT[i[start:end]] + B = subspaces_pad[j[start:end]] + + if gpu: + theta = np.asarray(_principal_angles_jax(AT,B)) + else: + theta =_principal_angles(AT,B) + + if return_angles: + thetas.append(theta) + + pdist[start:end] = _reduce_to_rank_and_get_dist(theta,i[start:end],j[start:end],ranks,contrast_unequal,dist_func) + print(f"Computing distances... {end/len(pdist):.0%}",end='\r') + if return_angles: + return pdist, thetas + else: + return pdist + +def grassmann_distance_matrix_precomputed_angles(subspaces,thetas,ij=None,metric='geodesic',gpu=True,contrast_unequal=True,max_nbytes = 1e8,max_rank=None): + + dist_func = DISTANCE_FUNCTIONS[metric] + xp = jnp if gpu else np + + + n = len(subspaces) + length = len(subspaces[0]) + ranks = np.array([subs.shape[1] for subs in subspaces]) + if max_rank is None: + max_rank = int(xp.max(ranks)) + elif max_rank > int(xp.max(ranks)): + raise ValueError('max_rank must be <= to the maximum rank of provided matrices') + else: + ranks = np.clip(ranks,None,max_rank) + dtype_size = subspaces[0].dtype.itemsize + blocksize = int((max_nbytes*dtype_size)/(length*max_rank*dtype_size)) + + # loop over blocks of indices + if ij is not None: + i, j = ij[0], ij[1] + else: + i, j = np.triu_indices(n, k=1) + pdist = np.full(len(i),np.nan) + + print('Computing distances... 
0%',end='\r') + for start in range(0, len(pdist), blocksize): + # Define last element for calculation + end = min(start + blocksize, len(pdist)) + theta = thetas[start:end] + pdist[start:end] = _reduce_to_rank_and_get_dist(theta,i[start:end],j[start:end],ranks,contrast_unequal,dist_func) + print(f"Computing distances... {end/len(pdist):.0%}",end='\r') + return pdist + +def conway_distance_matrix(subs_conway,ij=None,metric='geodesic_conway',gpu=True,max_nbytes = 1e8,): + + if gpu: metric+='_gpu' + dist_func = DISTANCE_FUNCTIONS[metric] + + n = len(subs_conway) + length = subs_conway.shape[1] + + dtype_size = subs_conway.dtype.itemsize + blocksize = int((max_nbytes*dtype_size)/(length*dtype_size)) + + # loop over blocks of indices + if ij is not None: + i, j = ij[0], ij[1] + else: + i, j = np.triu_indices(n, k=1) + pdist = np.full(len(i),np.nan) + + print('Computing distances... 0%',end='\r') + for start in range(0, len(pdist), blocksize): + # Define last element for calculation + end = min(start + blocksize, len(pdist)) + + cA = subs_conway[i[start:end]] + cB = subs_conway[j[start:end]] + pdist[start:end] = dist_func(cA,cB) + + print(f"Computing distances... {end/len(pdist):.0%}",end='\r') + + return pdist + +@nb.njit +def _reduce_to_rank_and_get_dist(theta,iblock,jblock,ranks,contrast_unequal,dist_func): + + blockdist=[] + for p in range(len(iblock)): + ranki, rankj = ranks[iblock[p]],ranks[jblock[p]] + _theta = theta[p,:min(ranki,rankj)] + + if contrast_unequal: + blockdist.append(dist_func(_theta,ranki,rankj)) + else: + blockdist.append(dist_func(_theta,0,0)) + return blockdist + +def _pad_and_concatenate(subspaces,shape,ranks=None): + '''Add zeros to a list of subspaces along last axis so they can be concatenated ''' + if ranks is None: + ranks = [s.shape[1] for s in subspaces] + subspaces_pad = np.zeros(shape) + for r0,rank in enumerate(ranks): + subspaces_pad[r0,:,:rank] = subspaces[r0][:,:rank] + return subspaces_pad + +##### Principal Angles ##### + +@jax.jit +def _principal_angles_jax(AT,B): + M = jnp.matmul(AT, B) + S = jnp.linalg.svd(M, compute_uv=False) + S = jnp.clip(S,None,1.) + return jnp.arccos(S) + +def _principal_angles(AT,B): + M = AT @ B + S = np.linalg.svd(M, compute_uv=False) + S = np.clip(S,None,1.) 
+ return np.arccos(S) + + + +##### Distance from frincipal Angles ##### + +@nb.njit +def asimov(theta,rank_i,rank_j): + return np.max(theta) + +@nb.njit +def asimov_reverse(theta,rank_i,rank_j): + return np.min(theta) + +@nb.njit +def binet_cauchy(theta,rank_i,rank_j): + return np.sqrt(1 - np.prod(np.cos(theta) ** 2)) + +@nb.njit +def chordal(theta,rank_i,rank_j): + contrast = abs(rank_i - rank_j) + return np.sqrt(contrast + np.sum(np.sin(theta) ** 2)) + +@nb.njit +def fubini_study(theta,rank_i,rank_j): + return np.arccos(np.prod(np.cos(theta))) + +@nb.njit +def geodesic(theta,rank_i,rank_j): + contrast = abs(rank_i - rank_j) * np.pi ** 2 / 4 + return np.sqrt(contrast + np.sum(theta ** 2)) + +@nb.njit +def martin(theta,rank_i,rank_j): + sys_float_info_min = 2.2250738585072014e-308 + cos_sq = np.cos(theta) ** 2 + cos_sq[np.where(cos_sq < sys_float_info_min)] = sys_float_info_min + return np.sqrt(np.log(np.prod(np.reciprocal(cos_sq)))) + +@nb.njit +def procrustes(theta,rank_i,rank_j): + contrast = abs(rank_i - rank_j) + return 2 * np.sqrt(contrast + np.sum(np.sin(theta / 2) ** 2)) + +@nb.njit +def projection(theta,rank_i,rank_j): + contrast = abs(rank_i - rank_j) + return np.sqrt(contrast + np.sum(np.sin(theta))) + +@nb.njit +def spectral(theta,rank_i,rank_j): + return 2 * np.sin(np.max(theta) / 2) + + + +##### Distance from Conway embedding ##### +def diag_indices_from_triu_flat_jax(n): + return jnp.cumsum(jnp.append(0,jnp.arange(n,1,-1))) + +@nb.njit +def diag_indices_from_triu_flat(n): + return np.cumsum(np.append(0,np.arange(n,1,-1))) + +def euclidean_conway_jax(cA,cB,abs_dot=False): + diffs = jnp.sum((cA-cB)**2) + if abs_dot: + #w unit vectors cosine = 1 - sqeucl/2 ---> sqeucl > 2 = negative cosine + idx = diffs > 2 + diffs[idx] = jnp.sum((cA[idx]+cB[idx])**2) + return jnp.sqrt(diffs,axis=-1) + +@nb.njit +def euclidean_conway(cA,cB,abs_dot=False): + diffs = np.sum((cA-cB)**2) + if abs_dot: + #w unit vectors cosine = 1 - sqeucl/2 ---> sqeucl > 2 = negative cosine + idx = diffs > 2 + diffs[idx] = np.sum((cA[idx]+cB[idx])**2) + return np.sqrt(diffs,axis=-1) + +def chordal_conway_jax(cA,cB): + dim = int((jnp.sqrt(8*cA.shape[1]+1)-1)/2) + diff = (cA-cB)**2 + diff[:,diag_indices_from_triu_flat_jax(dim)]/=2 + return jnp.sqrt(jnp.sum(2*diff,axis=-1))/jnp.sqrt(2) + +@nb.njit +def chordal_conway(cA,cB): + dim = int((np.sqrt(8*cA.shape[1]+1)-1)/2) + diff = (cA-cB)**2 + diff[:,diag_indices_from_triu_flat(dim)]/=2 + return np.sqrt(np.sum(2*diff,axis=-1))/np.sqrt(2) + +def geodesic_conway_jax(cA,cB,abs_dot=False): + norms_A = jnp.linalg.norm(cA,axis=-1) + norms_B = jnp.linalg.norm(cB,axis=-1) + inner_prod = jnp.sum(jnp.multiply(cA,cB),axis=-1) + if abs_dot: + inner_prod = jnp.abs(inner_prod) + cos_angle = inner_prod / (norms_A*norms_B) + cos_angle = jnp.clip(cos_angle, -1, 1) + return jnp.arccos(cos_angle) + +@nb.njit +def geodesic_conway(cA,cB,abs_dot=False): + norms_A = np.sqrt(np.sum(cA**2,axis=-1)) + norms_B = np.sqrt(np.sum(cB**2,axis=-1)) + inner_prod = np.sum(cA*cB,axis=-1) + if abs_dot: + inner_prod = np.abs(inner_prod) + cos_angle = inner_prod / (norms_A*norms_B) + cos_angle = np.clip(cos_angle, -1, 1) + return np.arccos(cos_angle) + +euclidean_conway_abs = partial(euclidean_conway,abs_dot=True) +euclidean_conway_abs_jax = jax.jit(partial(euclidean_conway_jax,abs_dot=True)) +euclidean_conway_jax = jax.jit(euclidean_conway_jax) + +geodesic_conway_abs = partial(geodesic_conway,abs_dot=True) +geodesic_conway_abs_jax = jax.jit(partial(geodesic_conway_jax,abs_dot=True)) +geodesic_conway_jax = 
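To make the principal-angle convention concrete, here is a self-contained NumPy check, independent of the jax/numba code above: the angles come from the singular values of A^T B, and the geodesic distance is the square root of the sum of squared angles, as in `geodesic` above (equal ranks here, so no contrast term).

```python
import numpy as np

A = np.array([[1., 0.], [0., 1.], [0., 0.]])   # orthonormal basis of the xy-plane in R^3
B = np.array([[1., 0.], [0., 0.], [0., 1.]])   # orthonormal basis of the xz-plane in R^3

theta = np.arccos(np.clip(np.linalg.svd(A.T @ B, compute_uv=False), None, 1.0))
print(theta)                       # one zero angle (the shared x-axis) and one pi/2 angle
print(np.sqrt(np.sum(theta**2)))   # geodesic distance, here pi/2
```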
jax.jit(geodesic_conway_jax) + +DISTANCE_FUNCTIONS = { +'asimov':asimov, +'asimov_reverse':asimov_reverse, +'binet_cauchy':binet_cauchy, +'chordal':chordal, +'fubini_study':fubini_study, +'geodesic':geodesic, +'martin':martin, +'procrustes':procrustes, +'projection':projection, +'spectral':spectral, + +'euclidean_conway':euclidean_conway, +'euclidean_conway_gpu':euclidean_conway_jax, +'euclidean_conway_abs':euclidean_conway_abs, +'euclidean_conway_abs_gpu':euclidean_conway_abs_jax, +'chordal_conway':chordal_conway, +'chordal_conway_gpu':chordal_conway_jax, +'geodesic_conway':geodesic_conway, +'geodesic_conway_gpu':geodesic_conway_jax, +'geodesic_conway_abs':geodesic_conway_abs, +'geodesic_conway_abs_gpu':geodesic_conway_abs_jax, + +} \ No newline at end of file diff --git a/skdim/subspace/main.py b/skdim/subspace/main.py new file mode 100644 index 0000000..89964da --- /dev/null +++ b/skdim/subspace/main.py @@ -0,0 +1,391 @@ +import numpy as np +import skdim +from sklearn.neighbors import NearestNeighbors +from sklearn.decomposition import PCA +from . import grassmann_distances, embedding, moments + +class Subspaces: + ''' Class to store and compute distances between subspaces (e.g., principal axes) + + Parameters + ---------- + n: int + embedding dimension + + Attributes + ---------- + n_points_: int + number of points stored in the class instance + ranks_: list + rank of each subspace + unique_ranks_: np.ndarray + unique ranks of subspaces + subspaces_: list[np.ndarray] + list of ordered orthonormal bases stored as column vectors of shape (n x ranks[i]). + sigmas_: list[np.ndarray] + eigenvalues of each subspace (if obtained from PCA or provided) + means_: list[np.ndarray] + means of subspaces (if obtained from PCA or provided) + + ''' + + def __init__(self,n): + self.n = n + self.n_points_ = 0 + self.unique_ranks_ = np.array([]) + self.ranks_, self.subspaces_, self.sigmas_, self.means_ = [], [], [], [] + + def __repr__(self): + return (f"Subspaces(n={self.n}, n_points_={self.n_points_}, unique_ranks_={self.unique_ranks_})") + + @classmethod + def from_projection(cls,data,ranks='auto'): + '''Initialize by projection (centering + SVD of each array in data) + + Parameters + ---------- + data: list[np.ndarray] + Arrays to project (n_points[i] x self.n) + ranks: str or int or iterable[int] + Method to determine the retained rank: + if 'auto': automatically determine the rank + if an integer: all arrays are projected to rank=ranks + if an iterable: data[i] is projected to rank=ranks[i] + ''' + return cls(data[0].shape[1]).add(data,project=True,ranks=ranks) + + @classmethod + def from_subspaces(cls,data,sigmas=None,rtol=1e-05, atol=1e-07): + '''Initialize by projection (centering + SVD of each array in data) + + Parameters + ---------- + data: list[np.ndarray] + Arrays with orthonormal columns to add + sigmas: list[np.ndarray] + Variance explained by each component of the array + rtol: float + Relative tolerance on the orthonormal requirement (see np.allclose doc) + atol: float + Absolute tolerance on the orthonormal requirement (see np.allclose doc) + ''' + return cls(data[0].shape[0]).add(data,project=False,sigmas=sigmas,rtol=rtol, atol=atol) + + def add(self,data,project=True,ranks='auto',sigmas=None,rtol=1e-05, atol=1e-07): + '''Add new arrays to project or new subspaces + + data: list[np.ndarray] + Arrays with orthonormal columns to add + project: bool + If True, each array is projected to a subspace by centering + SVD + If False, arrays are assumed to be subspaces already + ranks: str or 
int or iterable[int] + Method to determine the retained rank: + if 'auto': automatically determine the rank + if an integer: all arrays are projected to rank=ranks + if an iterable: data[i] is projected to rank=ranks[i] + sigmas: list[np.ndarray] + Variance explained by each component of the array + ''' + self._check_shape(data,project) + + if project: + self._add_projections(data,ranks=ranks) + else: + self._add_subspaces(data,sigmas=sigmas,rtol=rtol, atol=atol) + + self._store_attributes() + return self + + + def embed_conway(self,normalize=False,max_rank=None): + ''' Spherical embedding of subspaces. + + Isometric embedding of rank k-dimensional subspaces in R^n as points + on a sphere in dimension (n − 1)(n + 2)/2. Chordal distance becomes euclidean distance + and geodesic distance becomes spherical distance + + "Packing Lines, Planes, etc.: Packings in Grassmannian Spaces", + J. H. Conway, R. H. Hardin and N. J. A. Sloane + + Parameters + ---------- + normalize: bool + If True, return sphere normalized to radius=1 + max_rank: int + Clip subspaces rank to max_rank for computation before embedding + + Returns + ------- + self.subspaces_conway_: np.array + Conway embedding of subspaces, centered at zero + self.embed_conway_max_rank_: int + Stores max_rank parameter + self.embed_conway_orig_center: + Theoretical (original) center of the sphere + ''' + if max_rank is not None: + subs = [np.ascontiguousarray(s[:,:max_rank]) for s in self.subspaces_] + else: + subs = self.subspaces_ + + projs, orig_center = embedding.embed_conway_loop(subs,normalize=normalize) + + self.subspaces_conway_ = projs + self.embed_conway_max_rank = max_rank + self.embed_conway_normalize = normalize + self.embed_conway_orig_center_ = orig_center + + def grassmann_distances(self,Subspaces=None,metric='geodesic',contrast_unequal=True, + gpu=True,max_nbytes = 1e8,max_rank=None,return_angles=True): + ''' Compute Grassmannian distance matrix + from principal angles or Conway's spherical embedding (see Subspaces.embed_conway) + + Parameters + ---------- + Subspaces: subspace.Subspaces + Another Subspaces instance to compute distance matrix against + (with shape self.n_points_ x Subspaces.n_points_). + If provided, outputs will be directly returned + metric: str + Which Grassmannian metric to use: + Distances from principal angles: + 'asimov', 'binet_cauchy','chordal','fubini_study', + 'geodesic','martin','procrutes','projection','spectral' + Distances from Conway embedding: + 'euclidean_conway': euclidean distance (~= proportional to 'chordal') + 'geodesic_conway_abs': euclidean distance, flipping vectors with negative cosine similarity + 'chordal_conway': euclidean distance with scaled off-diagonal terms (='chordal') + 'geodesic_conway': distance on the surface of Conway sphere (~= proportional to 'geodesic') + 'geodesic_conway_abs': distance on the surface of Conway sphere, flipping vectors with negative cosine similarity + + contrast_unequal: bool + Whether to contrast the distance between subspaces of different dimensions + as proposed in + "Schubert varieties and distances between subspaces of different dimensions", + Ke Ye & Lek-Heng Lim. SIAM J. MATRIX ANAL. APPL. 2016 Vol. 37, No. 3, pp. 
1176–1197 + gpu: bool + If True, computations are done on GPU using jax.numpy, otherwise numpy + max_nbytes: + Maximum block size, in bytes, to compute at once + max_rank: int + Clip subspaces rank to max_rank for computation of distances + return_angles: + Whether to return principal angles + + Returns + ------- + self.pdist_: np.ndarray + Grassmann distance matrix, in pdist format + (use scipy.spatial.distance.squareform to get square format) + ''' + #compute X,Y else X,X distance matrix + if Subspaces is None: + self_, ij = self, None + else: + self_ = self.from_subspaces(self.subspaces_ + Subspaces.subspaces_,rtol=10,atol=10) + n, m = self.n_points_, Subspaces.n_points_ + i, j = np.ones((n,m)).nonzero() + ij = (i, j + n) # compute only last m columns of concatenated subspace list + + # conway embedding distances + if 'conway' in metric: + if not contrast_unequal: + raise ValueError('contrast_unequal=False not supported when using Conway embedding.') + + if not hasattr(self_,'subspaces_conway_') or self_.embed_conway_max_rank != max_rank: + self_.embed_conway(max_rank=max_rank) + + self_.pdist_ = grassmann_distances.conway_distance_matrix(self_.subspaces_conway_,ij=ij,metric=metric,gpu=gpu,max_nbytes=max_nbytes) + + # principal angle distances + else: + kwargs = dict(subspaces=self_.subspaces_,ij=ij,metric=metric, + contrast_unequal=contrast_unequal, + gpu=gpu,max_nbytes = max_nbytes,max_rank=max_rank, + return_angles=return_angles) + + if not(hasattr(self_,'thetas_')) or len(self_.thetas_) != len(self_.subspaces_): + func = grassmann_distances.grassmann_distance_matrix + else: + func = grassmann_distances.grassmann_distance_matrix_precomputed_angles + kwargs['thetas'] = self_.thetas_ + return_angles = False + + if return_angles: + self_.pdist_, self_.thetas_ = func(**kwargs) + else: + self_.pdist_ = func(**kwargs) + + if Subspaces is not None: + if return_angles: + return self_.pdist_.reshape(n,m), self_.thetas_ + else: + return self_.pdist_.reshape(n,m) + + self_.grassmann_distances_metric=metric + self_.grassmann_distances_max_rank=max_rank + self_.grassmann_distances_contrast_unequal=contrast_unequal + + def grassmann_knn(self,n_neighbors=5,algorithm='auto',leaf_size=30,n_jobs=1, + metric='euclidean_conway',max_rank=None): + + ''' + Compute Grassmannian kNN from Conway's spherical embedding (see Subspaces.embed_conway) + + Parameters + ---------- + metric: str + Which Grassmannian metric to use from 'chordal_conway', 'geodesic_conway' + max_rank: int + Clip subspaces rank to max_rank for computation of distances + Other parameters are passed to sklearn.neighbors.NearestNeighbors + + Returns + ------- + self.grassmann_knndis_ + self.grassmann_knnidx_ + self.grassmann_knn_metric + self.grassmann_knn_max_rank + + ''' + + if metric not in ['euclidean_conway','chordal_conway','spherical_conway','geodesic_conway']: + raise ValueError("Metric must be on the Conway sphere. 
See self.grassmann_distances ") + if not hasattr(self,'subspaces_conway_') or self.embed_conway_max_rank != max_rank: + self.embed_conway(max_rank) + + if metric != 'euclidean_conway': + dist_fun = grassmann_distances.DISTANCE_FUNCTIONS[metric] + else: + dist_fun = 'minkowski' + + self.grassmann_knndis_, self.grassmann_knnidx_ = NearestNeighbors( + metric=dist_fun, + n_neighbors=n_neighbors, + algorithm=algorithm, + leaf_size=leaf_size, + n_jobs=n_jobs,).fit(self.subspaces_conway_).kneighbors() + + self.grassmann_knn_metric = metric + self.grassmann_knn_max_rank = max_rank + + def mean(self,kind='flag'): + ''' + Compute mean of subspaces. + Subspaces need to all be of same rank except for kind='flag' + + Parameters + ---------- + kind: str + Which mean formula to apply + Returns + ------- + self.mean_: np.array + Mean subspace basis + self.svals_: np.array + Singular values associated with the mean + + ''' + if kind != 'flag': + raise ValueError('Only flag mean is currently implemented') + self.mean_, self.mean_svals_ = moments.flag_mean(self.subspaces_) + + def median(self,kind='irls',rank=None,n_its=100): + ''' + Compute median of subspaces. + Subspaces need to all be of same rank except for kind='irls' + + Mankovich, Nathan, et al. "The Flag Median and FlagIRLS.", 2022. + Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. + + Parameters + ---------- + kind: str + Which mean formula to apply + rank: int + Rank of median subspace to compute. Default is max(self.ranks) + n_its: int + Number of iterations + + Returns + ------- + self.median_: np.array + Mean subspace basis + self.median_err_: list + Objective function error + + ''' + if kind != 'irls': + raise ValueError('Only flag irls median is currently implemented') + if rank is None: + rank = max(self.ranks_) + self.median_, self.median_err_ = moments.irls_flag(self.subspaces_,r=rank,n_its=n_its) + + def flatten(self): + ''' + Returns a copy where basis vectors + for each subspace are considered as different 1-d subspaces. 
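A small NumPy check of the geometry behind the Conway embedding used by `embed_conway` and `grassmann_knn` above: projection matrices of d-dimensional subspaces of R^m all lie at Frobenius distance sqrt(d*(m-d)/m) from (d/m)*I, which matches the `radius` formula computed in `embed_conway`. This is an illustrative sketch with random data, not part of the patch.

```python
import numpy as np

m, d = 6, 2
U = np.linalg.qr(np.random.default_rng(0).normal(size=(m, d)))[0]  # random orthonormal basis
P = U @ U.T                                                        # projection onto the subspace

print(np.linalg.norm(P - (d / m) * np.eye(m)))  # Frobenius distance to the sphere center
print(np.sqrt(d * (m - d) / m))                 # matches the radius formula above
```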
+ ''' + data = [np.ascontiguousarray(v.reshape(-1,1)) for s in self.subspaces_ for v in s.T] + return self.from_subspaces(data=data) + + def flip_sign(self): + ''' + Harmonize basis vector directions + by multiplying with the sign of the first coordinate + ''' + for s in self.subspaces_: + s*=np.sign(s[[0]]) + + def _check_shape(self,data,project): + if project and len(np.unique([d.shape[1] for d in data])) != 1: + raise ValueError('embedding dimension (data[i].shape[1]) ' + 'must be equal for all arrays in data') + elif not project and len(np.unique([d.shape[0] for d in data])) != 1: + raise ValueError('embedding dimension (data[i].shape[0]) ' + 'must be equal for all arrays in data') + + def _check_point(self,array,rtol=1e-05, atol=1e-07): + if not (array.shape[0]==self.n and array.ndim == 2 and ('float' in array.dtype.name) and + np.allclose(array.T @ array, np.eye(array.shape[1]),rtol=rtol,atol=atol)): + raise ValueError('point(s) do not belong to the manifold') + + def _add_projections(self,data,ranks): + + if isinstance(ranks, int): + ranks = np.repeat(ranks,len(data)) + elif isinstance(ranks, str) and ranks == "auto": + ranks = np.repeat(None,len(data)) + elif len(ranks) == len(data): + pass + else: + raise ValueError("The input parameter ranks must be either" + "'auto', an integer, or an array of integers with same length as data") + + for X,rank in zip(data,ranks): + pca = PCA(n_components=rank).fit(X) + if rank is None: + rank = skdim.id.lPCA(alphaFO=.1,fit_explained_variance=True).fit_transform(pca.explained_variance_) + self.subspaces_.append(np.ascontiguousarray(pca.components_[:rank].T)) + self.sigmas_.append(pca.explained_variance_) + self.ranks_.append(rank) + self.means_.append(pca.mean_) + + def _add_subspaces(self,subspaces,sigmas=None,means=None,rtol=1e-05, atol=1e-07): + if sigmas is None: + sigmas = np.repeat(None,len(subspaces)) + if means is None: + means = np.repeat(None,len(subspaces)) + for X,sigma,mean in zip(subspaces,sigmas,means): + self._check_point(X,rtol=rtol,atol=atol) + self.subspaces_.append(X) + self.sigmas_.append(sigma) + self.ranks_.append(X.shape[1]) + self.means_.append(mean) + + def _store_attributes(self): + self.n_points_ = len(self.subspaces_) + self.ranks_ = [s.shape[1] for s in self.subspaces_] + self.unique_ranks_ = np.unique(self.ranks_) + self.max_rank_ = max(self.ranks_) \ No newline at end of file diff --git a/skdim/subspace/moments.py b/skdim/subspace/moments.py new file mode 100644 index 0000000..aa5ac8b --- /dev/null +++ b/skdim/subspace/moments.py @@ -0,0 +1,160 @@ +import numpy as np + +def flag_mean(subspaces): + ''' Flag mean of subspaces + + Code taken from "Finding the subspace mean or median to fit your need." + T. Marrinan, J. R. Beveridge, B. Draper, M. Kirby, and C. Peterson. + Proc. IEEE Conference on Computer Vision and Pattern Recognition (CVPR), (2014): 1082-1089. 
+ + Parameters + ---------- + subspaces: list[np.array] + Collection of subspaces of shape (n x rank[i]) to average + + Returns + ------- + basis: np.array + Flag mean basis + s: np.array + Flag mean singular values + ''' + + A_prime = np.concatenate(subspaces,axis=1) + flip = A_prime.shape + if flip[0] >= flip[1]: + u, s, vt = np.linalg.svd(A_prime,0) + mu = u + else: + v, s, uT = np.linalg.svd(A_prime.T,0) + mu = uT.T + + return mu, s + + + +def irls_flag(subspaces, r, n_its=100, sin_cos='sine', opt_err = 'sine', init = 'random', seed = 0): + ''' Code taken from https://github.com/nmank/Flag_Median_and_FlagIRLS + Use FlagIRLS on subspaces to output a representative for a point in Gr(r,n) + which solves the input objection function + Repeats until iterations = n_its or until objective function values of consecutive + iterates are within 0.0000000001 and are decreasing for every algorithm (except increasing for maximum cosine) + Inputs: + subspaces - list of numpy arrays representing points on Gr(k_i,n) + r - the number of columns in the output + n_its - number of iterations for the algorithm + sin_cos - a string defining the objective function for FlagIRLS + 'sine' = flag median + opt_err - string for objective function values in err (same options as sin_cos) + init - string 'random' for random initlalization. + otherwise input a numpy array for the inital point + seed - seed for random initialization, for reproducibility of results + Outputs: + Y - a numpy array representing the solution to the chosen optimization algorithm + err - a list of the objective function values at each iteration (objective function chosen by opt_err) + ''' + err = [] + n = subspaces[0].shape[0] + + + #initialize + if init == 'random': + #randomly + np.random.seed(seed) + Y_raw = np.random.rand(n,r)-.5 + Y = np.linalg.qr(Y_raw)[0][:,:r] + elif init == 'subspaces': + np.random.seed(seed) + Y = subspaces[np.random.randint(len(subspaces))] + else: + Y = init + + err.append(_calc_error_1_2(subspaces, Y, opt_err)) + + #flag mean iteration function + #uncomment the commented lines and + #comment others to change convergence criteria + + itr = 1 + diff = 1 + while itr <= n_its and diff > 0.0000000001: + Y0 = Y.copy() + Y = _flag_mean_iteration(subspaces, Y, sin_cos) + err.append(_calc_error_1_2(subspaces, Y, opt_err)) + diff = err[itr-1] - err[itr] + + itr+=1 + + + + if diff > 0: + return Y, err + else: + return Y0, err[:-1] + +def _flag_mean_iteration(subspaces, Y0, weight, eps = .0000001): + ''' Code taken from https://github.com/nmank/Flag_Median_and_FlagIRLS + Calculates a weighted Flag Mean of data using a weight method for FlagIRLS + eps = .0000001 for paper examples + Inputs: + subspaces - list of numpy arrays representing points on Gr(k_i,n) + Y0 - a numpy array representing a point on Gr(r,n) + weight - a string defining the objective function + 'sine' = flag median + 'sinsq' = flag mean + eps - a small perturbation to the weights to avoid dividing by zero + Outputs: + Y- the weighted flag mean + ''' + r = Y0.shape[1] + + aX = [] + al = [] + + ii=0 + + for x in subspaces: + if weight == 'sine': + m = np.min([r,x.shape[1]]) + sinsq = m - np.trace(Y0.T @ x @ x.T @ Y0) + al.append((sinsq+eps)**(-1/4)) + else: + print('unrecognized weight') + aX.append(al[-1]*x) + ii+= 1 + + Y = flag_mean(aX)[0][:,:r] + + return Y + +def _calc_error_1_2(subspaces, Y, sin_cos): + ''' Code taken from https://github.com/nmank/Flag_Median_and_FlagIRLS + Calculate objective function value. 
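The flag mean computed above reduces to the left singular basis of the horizontally concatenated input bases; the following minimal NumPy sketch (illustrative vectors, not from the patch) shows two nearby 1-d subspaces of R^3 averaging to a direction between them.

```python
import numpy as np

u1 = np.array([[1.0], [0.1], [0.0]]); u1 /= np.linalg.norm(u1)   # first 1-d subspace
u2 = np.array([[1.0], [-0.1], [0.0]]); u2 /= np.linalg.norm(u2)  # second 1-d subspace

mu, s = np.linalg.svd(np.concatenate([u1, u2], axis=1), full_matrices=False)[:2]
print(mu[:, 0])  # leading flag-mean direction, close to the x-axis (up to sign)
```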
+    Inputs:
+        subspaces - a list of numpy arrays representing points in Gr(k_i,n)
+        Y - a numpy array representing a point on Gr(r,n)
+        sin_cos - a string defining the objective function
+                    'sine' = flag median
+                    'sinesq' = flag mean
+    Outputs:
+        err - objective function value
+    '''
+    k = Y.shape[1]
+    err = 0
+    if sin_cos == 'sine':
+        for x in subspaces:
+            r = np.min([k,x.shape[1]])
+            sin_sq = r - np.trace(Y.T @ x @ x.T @ Y)
+            #fixes numerical errors
+            if sin_sq < 0:
+                sin_sq = 0
+            err += np.sqrt(sin_sq)
+    elif sin_cos == 'sinesq':
+        for x in subspaces:
+            r = np.min([k,x.shape[1]])
+            sin_sq = r - np.trace(Y.T @ x @ x.T @ Y)
+            #fixes numerical errors
+            if sin_sq < 0:
+                sin_sq = 0
+            err += sin_sq
+    return err
\ No newline at end of file
diff --git a/skdim/subspace/utils.py b/skdim/subspace/utils.py
new file mode 100644
index 0000000..bfb63dd
--- /dev/null
+++ b/skdim/subspace/utils.py
@@ -0,0 +1,216 @@
+import numpy as np
+import pandas as pd
+import sklearn.cluster
+import scipy.cluster.vq
+import scipy.sparse
+import matplotlib.pyplot as plt
+from scipy.spatial.distance import squareform
+try:
+    from kneed import KneeLocator
+except ImportError:
+    pass
+
+def draw_vector(v0, v1, ax=None,c='k',text=''):
+    ax = ax or plt.gca()
+    ax.annotate('', v1, v0, arrowprops=dict(arrowstyle='->',
+                                            linewidth=2,
+                                            shrinkA=0, shrinkB=0,color=c))
+    ax.annotate(text, v1, v1+v1*.02, horizontalalignment="center")
+
+def kmeans_centroids(X,n_clusters=None):
+    ''' Downsample data by finding the n_clusters points closest to the kmeans centroids
+        sel_idx: indices of the selected representative points. May contain fewer than n_clusters entries if some clusters are empty
+        sel_part: assignment of each point to its closest representative, i.e., sel_part[1]=2 means X[1] is assigned to X[sel_idx[2]]
+    '''
+    if n_clusters is None:
+        n_clusters = min(int(len(X)*.1),5000)
+
+    centroids = sklearn.cluster.KMeans(n_clusters=n_clusters).fit(X).cluster_centers_
+    cpart,dis = scipy.cluster.vq.vq(X,centroids)
+    ppart = np.zeros(n_clusters,dtype=int)
+    for p in range(n_clusters):
+        _ix = np.where(cpart==p)[0]
+        if len(_ix):
+            ppart[p] = _ix[dis[_ix].argmin()]
+        else:
+            ppart[p] = -1
+
+    sel_idx = [i for i in ppart if i != -1]
+    sel_part = scipy.cluster.vq.vq(X,X[sel_idx])[0]
+    return sel_idx,sel_part
+
+def upsample_distance_matrix_pdist(sD,part):
+    ''' Expand distances between representatives (square matrix sD) into a
+        condensed (scipy pdist style) distance vector between all points
+    '''
+    n = len(part)
+    triu_ix = np.array(np.triu_indices(len(sD),1)).T
+    pdist = np.zeros(n*(n-1)//2)
+    for i,j in triu_ix:
+        rows = np.where(part==i)[0]
+        cols = np.where(part==j)[0]
+        for a in rows:
+            for b in cols:
+                #map the point pair (a,b) to its condensed pdist index
+                lo, hi = (a, b) if a < b else (b, a)
+                pdist[n*lo - lo*(lo+1)//2 + (hi-lo-1)] = sD[i,j]
+    return pdist
+
+def upsample_distance_matrix_squareform(sD,part):
+    ''' Expand distances between representatives (square matrix sD) into a
+        sparse square distance matrix between all points
+    '''
+    triu_ix = np.array(np.triu_indices(len(sD),1)).T
+    D = scipy.sparse.lil_matrix((len(part),len(part)))
+    for i,j in triu_ix:
+        rows = part==i
+        cols = part==j
+
+        D[np.ix_(rows,cols)] = D[np.ix_(cols,rows)] = sD[i,j]
+    return D
+
+def upsample_labels(labels,part):
+    labels_all = np.zeros(len(part),dtype=int)
+    upart = np.unique(part)
+    for p in upart:
+        labels_all[part==p]=labels[p]
+    return labels_all
+
+def find_orth(O):
+    '''
+    Append a zero coordinate to the basis (R^n -> R^n+1) and concatenate
+    a unit vector orthogonal to all of its column vectors
+    '''
+    O_n_plus_1 = np.vstack((O,np.zeros(O.shape[1])))
+    rand_vec = np.random.rand(O_n_plus_1.shape[0], 1)
+    A = np.hstack((O_n_plus_1, rand_vec))
+    b = np.zeros(O_n_plus_1.shape[1] + 1)
+    b[-1] = 1
+    orth_vec = np.linalg.lstsq(A.T, b, rcond=None)[0]
+    orth_vec /= np.linalg.norm(orth_vec)
+    A[:,-1] = orth_vec
+    return A
+
+def graff2grass(subspace,mean):
+    '''Graff(k,n) to
Gr(k+1,n+1) + + Returns + ------- + subspace_n_plus_1: np.array + Subspace in R^n+1 with mean added to the last basis vector + grassmann_basis: np.array + Gr(k+1,n+1) basis corresponding to the input Graff(k,n) basis + ''' + k = subspace.shape[1] + # Append new vector orthogonal to subspace basis (n->n+1, k->k+1) + subspace_n_plus_1 = find_orth(subspace) + # Add mean to shift the new basis vector + subspace_n_plus_1[:,-1] += np.append(mean,0.) + u,s,vh=np.linalg.svd(subspace_n_plus_1) + grassmann_basis = u[:,:k+1] + return subspace_n_plus_1, grassmann_basis + + +def refine_clustering(labels,knnidx,k=20,min_size=0,smooth=True): + + clus_min_size = labels.copy() + if min_size > 0: + ulabels = np.unique(labels) + counts = np.bincount(labels,minlength=ulabels.max()+1) + for i,ix in enumerate(labels): + xi = np.where(labels==ix)[0] + if len(xi)