diff --git a/.gitignore b/.gitignore index 39dcc8a554d1be3f8a4216dab35124b50d98b21b..bee1ff36b3bcf32022297bafb2a085d2c4021f70 100644 --- a/.gitignore +++ b/.gitignore @@ -144,4 +144,11 @@ cython_debug/ nohup.out ## vscode config -.vscode/ \ No newline at end of file +.vscode/ + +.test.py + +## directories that are output when running the tests +tests/PLN* + +docs/source/*
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2463566c7ec83d3464c4ed38ace045cf1f99fe87..b99960db44d8ba2a5e74108f68a44f3bc6e2bffd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -29,8 +29,6 @@ publish_package: - TWINE_PASSWORD=${pypln_token} TWINE_USERNAME=__token__ python -m twine upload dist/* tags: - docker - only: - - main only: - tags
diff --git a/README.md b/README.md index 266c4fe18ea1986a16d53ff1ddacba486b83f8e9..e98669f64e980c17448f4a166c732fda3813ece4 100644 --- a/README.md +++ b/README.md @@ -1,339 +1,59 @@
-# Usage and main fitting functions
-### Description of the package
+# PLNmodels: Poisson lognormal models
-The package implements 3 differents classes that fits a PLN-PCA model (described in the mathematical description above). Basically, it tries to find the correlation between features and the effect of covariables on these features. As main characteristic, this model takes into account count data.
-- The fastPLN class fits a PLN model (a special PLN-PCA model) using variational approximation.
-- The fastPLNPCA class fits a PLN-PCA using variational approximation.
-- The IMPS_PLN fits a PLN-PCA model using Importance sampling.
+> The Poisson lognormal model and variants can be used for analysis of multivariate count data.
+> This package implements
+> efficient algorithms to fit such models.
<!-- accompanied with a set of -->
<!-- > functions for visualization and diagnostic. See [this deck of -->
<!-- > slides](https://pln-team.github.io/slideshow/) for a -->
<!-- > comprehensive introduction. -->
-IMPS_PLN is always slower than fastPLN. fastPLNPCA is faster than fastPLN only for datasets with very large number of genes (p>5000, see [here](https://github.com/PLN-team/PLNpy/blob/master/images/Comparison_fastPLN_vs_fastPLNPCA_n%3D1000.png)). However,
-fastPLNPCA is convenient since it allows to get the Principal Components (PCs) directly, in contrary to fastPLN. To get the PCs using fastPLN, you first need to fit the model and do a PCA on the matrix $\Sigma$ found. The numerical complexity is always linear with respect to the number of samples n_samples (see [here](https://github.com/PLN-team/PLNpy/blob/master/images/Comparison_fastPLN_vs_fastPLNPCA_p%3D1000.png))
-All of these class are aggregated into the class PLNmodel, so that you don't need to deal with multiple classes. This class will automatically fit the data with one of those classes.
+## Installation
-### How to use the package?
+**PLNmodels** is available on
+[pypi](https://pypi.org/project/pyPLNmodels/). The development
+version is available on [GitHub](https://github.com/PLN-team/pyPLNmodels).
-First, you need to pip install the package. We recommend to create a new environment before installing the package.
+### Package installation ``` pip install pyPLNmodels ```
-The package comes with an artificial dataset to present the functionality.
You can load it doing the following: +## Usage and main fitting functions +The package comes with an ecological data set to present the functionality ``` -import pandas as pd -Y = pd.read_csv('example_data/Y_test') -O = pd.read_csv('example_data/O_test') -cov = pd.read_csv('example_data/cov_test') +import pyPLNmodels +from pyPLNmodels.models import PLNPCA, PLN +from pyPLNmodels.oaks import load_oaks +oaks = load_oaks() ``` -If you want $q$ Principal Composants, you only need to call: +### Unpenalized Poisson lognormal model (aka PLN) ``` -from pyPLNmodels.models import PLNmodel -nbpcs = 5 # number of principal components -mypln = PLNmodel(q= nbpcs) -mypln.fit(Y,O,cov) -print(mypln) +pln = PLN("counts ~ 1 + tree + dist2ground + orientation ", data = oaks, take_log_offsets = True) +pln.fit() +print(pln) ``` -Note that if you do not specify $q$, it will take the maximum possible value. You can look for a better approximation by setting ```fast = False ``` in the ```.fit()``` method, but it will take much more time: -``` -nbpcs = 5 -mypln = PLNmodel(nbpcs) -mypln.fit(Y,O,cov, fast = False) -print(mypln) -``` - - - -###### How to get the model parameters back ? - -You can get the model parameters back running: - -``` -beta = mypln.get_beta() -C = mypln.get_C() -Sigma = mypln.get_Sigma() -``` - - - - -<strong>This class automatically picks the right model among ```fastPLN, fastPLNPCA``` and ```IMPS_PLN```. If you want to know more about each of these algorithms that are quite different, you can check below.</strong> - -### How to fit each model? - -#### Fit the PLN model - - -You have to call : - -``` -from pyPLNmodels.models import fastPLN -fast = fastPLN() -fast.fit(Y,O,cov) -print(fast) -``` - - - -##### Hyperparameters - -Here are the main hyperparameters of the ```.fit()``` method of the ```fastPLN``` object: -- ```N_iter_max```: The maximum number of iteration you are ready to do. If the algorithm has not converged, you can try to increase it. Default is 200. -- ```tol```: tolerance of the model. The algorithm will stop if the ELBO (approximated likelihood of the model) has not increased of more than ```tol```. Try to decrease it if the algorithm has not converged. Default is ```1e-1``` -- ```good_init```: If set to ```True```, the algorithm will do an initialization that can take some time, especially for large datasets. You can set it to ```False``` if you want a much faster but random initialization. Default is ```True```. - -Those 3 parameters are important. However, they won't change the asymptotic behavior of the algorithm. If you launch the algorithm for a sufficient time (i.e. ```tol``` is small enough and ```N_iter_max``` is big enough), it will converge to the right parameters independently of the hyperparameters. Moreover, the default arguments are convenient for most datasets. -If you want to see the progress of the algorithm in real time, you can set ```Verbose = True``` in the .```fit()``` method. - -##### How to be sure the algorithm has converged ? - -Basically, if the ELBO reaches a plateau, the algorithm has converged. If it has not reached a plateau, then you can try to increase the number of iteration ```N_iter_max``` or lower the tolerance ```tol```. - -Note that you don't need to restart the algorithm from the beginning, you can start from where the algorithm has stopped by calling: - -``` -fast.fit(Y,O,cov, N_iter_max = 500, tol = 1e-5) -``` -The numerical complexity is quadratic with respect to the number of genes p. 
- -### Fit the fastPLNPCA model - -To fit the ```fastPLNPCA``` object, you first need to declare the number of PCs you want, and then you can fit the object: +### Rank Constrained Poisson lognormal for Poisson Principal Component Analysis (aka PLNPCA) ``` -from pyPLNmodels.models import fastPLNPCA -nbpcs = 5 -fastpca = fastPLNPCA(q=nbpcs) -fastpca.fit(Y,O,cov) -print(fastpca) +pca = PLNPCA("counts ~ 1 + tree + dist2ground + orientation ", data = oaks, take_log_offsets = True, ranks = [3,4,5]) +pca.fit() +print(pca) ``` - - -The hyperparameters of the ```.fit()``` method are the same as for the ```fastPLN``` object. Only the Default values of ```N_iter_max ``` and ```tol``` are differents: - -- ```N_iter_max ``` default is : 5000 -- ```tol ``` default is : 1e-3 - -You can check if the algorithm has converged following the same guidelines as for ```fastPLN```. -The numerical complexity is linear with respect to the number of genes p. - -### Fit the IMPS_PLN model - - -To fit the IMPS based model, you need to declare the number of Principal composents, and then you can fit the model: -``` -from pyPLNmodels.models import IMPS_PLN -nbpcs = 5 -imps = IMPS_PLN(nbpcs) -imps.fit(Y,O,cov) -print(imps) -``` - - - - - - -##### Hyperparameters - -The hyperparameters of the ```.fit()``` method of the ```IMPS_PLN``` are more complicated and technical. We suggest to take a look at the mathematical description of the package to gain intuition. Basically, the ```IMPS_PLN``` estimates the gradients of the log likelihood with importance sampling. Here are the main hyperparameters and their impacts: -- ```acc```: the accuracy of the approximation. The lower the better the gradient approximation, but the lower the algorithm. Default is 0.005 You can try to increasing it if you want to be faster. However reducing it won't gain much accuracy, and will significantly increase the convergence time. -- ``` N_epoch_max```: The maximum number of iteration you are ready to do. If the tolerance has not converged, you can try to increase it. Default is 500. -- ```lr```: Learning rate of the gradient ascent. You can try to reduce it or lower it, and see if the final likelihood has improved. Default is 0.1. -- ```batch_size```: The batch size of the gradient descent. The larger the more accurate the gradients, but the slower the algorithm. if you have very large datasets, you can try to increase it. If you decrease it, then you hsould also decrease the learning rate. Default is 40. Should not exceed the number of samples you have in your dataset. -- ```optimizer```: The optimizer you take for the gradient ascent. You can try ```torch.optim.RMSprop```, which is more robust to inappropriate learning rates. However lower the learning rate to 0.01 if using ```torch.optim.RMSprop```. Default is ```torch.optim.Adagrad```. -- ```nb_plateau```: The algorithm will stop if the likelihood of the model has not increased during ```nb_plateau``` epochs. Default is 15. -- ```nb_trigger```: Since the likelihood is approximated and random, we consider that the likelihood does not increase if during ```nb_trigger``` iterations it has not improved from the maximum likelihood computed. This parameter is here to deal with the randomness of the criterion. Default is 5. -- ```good_init```: If set to ```True```, the algorithm will do a precise initialization (that takes some time). You can remove this step by setting ```good_init = False ```. Default is True. 
- -You can see the progress of the algorithm in real time by setting ```verbose = True``` in the ```.fit()``` method. -The numerical complexity is linear with respect to the number of genes p. - -##### How to be sure the algorithm has converged ? - -Unfortunately, there is no heuristics to know if the algorithm has converged. Indeed, even if you reach a plateau, it is possible that you can reach a much better plateau with other hyperparameters. However, this is in fact due to the choice of ```torch.optim.Adagrad``` as optimizer. If it has converged, it will be a very good solution. To have a (fast) convergence, you need to take the learning rate in the right interval. Fortunately, it is quite large: about ```[0.01, 0.3]``` for many cases. - -If you have still not converged, you can try to change the optimizer to ```torch.optim.RMSprop```, but lower the learning to 0.02 or lower. You can also increase the batch_size and the number of iteration you do. If your dataset is not too big, as a last resort, you can try to set the learning rate to 0.1, taking as optimizer ```torch.optim.Rprop``` and set the ```batch_size``` to the number of samples you have in your dataset. - - -### How to retrieve the parameters of the model ? - -After fitting the model, one can retrieve the parameters of the model. To retrieve $\beta$, you only need to call: - -```beta_chap = model.get_beta() ``` - -To retrieve $\Sigma$, you only need to call: - -```Sigma_chap = model.get_Sigma()``` - -Note that for the PCA models, this matrix won't be invertible. - -To retrieve $C$, you only need to call: - -```C_chap = model.get_C()``` - -For the fastPLN object, you will get a Matrix of size $(p,p)$ containing the eigenvectors of $\Sigma$ numberred progressively from the eigenvectors with largest eigenvalue to the lowest. - - - -## Quick mathematical description of the package. - -The package tries to infer the parameters of two models: - -- Poisson Log Normal-Principal Composent Analysis model (PLN-PCA) -- Poisson Log Normal model (PLN) (special case of PLN-PCA model) - - - - -We consider the following model PLN-PCA model: - -- Consider $n$ samples $(i=1 \ldots n)$ - -- Measure $x_{i}=\left(x_{i h}\right)_{1 \leq h \leq d}$ : -$x_{i h}=$ (covariate) for sample $i$ (altitude, temperature, categorical covariate, ...) - -- Consider $p$ features (genes) $(j=1 \ldots p)$ Measure $Y=\left(Y_{i j}\right)_{1 \leq i \leq n, 1 \leq j \leq p}$ : - -- Measure $Y = Y_{i j}=$ number of times the feature $j$ is observed in sample $i$. - -- Associate a random vector $Z_{i}$ with each sample. -- Assume that the unknown $\left(W_{i}\right)_{1 \leq i \leq n}$ are independant and living in a space of dimension $q\leq p$ such that: - -$$ -\begin{aligned} -W_{i} & \sim \mathcal{N}_p\left(0, I_{q}\right) \\ -Z_{i} &=\beta^{\top}\mathbf{x}_{i} +\mathbf{C}W_i \in \mathbb R^p \\ -Y_{i j} \mid Z_{i j} & \sim \mathcal{P}\left(\exp \left(o_{ij} + Z_{i j}\right)\right) -\end{aligned} -$$ - -and $C\in \mathbb R^{p\times q}$, $\beta \in \mathbb R^{d\times p}$. - -Where $O = (o_{ij})_{1\leq i\leq n, 1\leq j\leq p}$ are known offsets. - -We can see that - -$$Z_{i} \sim \mathcal N_p (\beta^{\top}\mathbf{x}_{i}, \Sigma) $$ - -The unknown parameter is $\theta = (\Sigma,\beta)$. The latent variable of the model can be seen as $Z$ or $W$. - - -- When $p=q$, we call this model Poisson-Log Normal (PLN) model. In this case, $Z_i$ is a non-degenerate gaussian with mean $\beta^{\top}\mathbf{x}_{i} \in \mathbb R^p$ and covariance matrix $\Sigma$. 
-- When $p<q$, we call this model Poisson-Log Normal-Principal Component Analysis (PLN-PCA). Indeed, we are doing a PCA in the latent layer, estimating $\Sigma$ with a ranq $q$ matrix: $CC^{\top}$. - -The goal of this package is to retrieve $\theta$ from the observed data $(Y, O, X)$. To do so, we will try to maximize the log likelihood of the model: -$$p_{\theta}(Y_i) = \int_{\mathbb R^q} p_{\theta}(Y_i,W)dW \overset{\text{ (if } p=q\text{)}}{=} \int_{\mathbb R^p} p_{\theta}(Y_i,Z)dZ$$ - -However, almost any integrals involving the law of the complete data is unreachable, so that we can't perform neither gradient ascent algorithms nor EM algorithm. -We adopt two different approaches to circumvent this problem: -- Variational approximation of the latent layer (Variational EM) -- Importance sampling based algorithm, using a gradient ascent method. - - - - - - - - -## Variational approach - -We want here to use the EM algorithm, but the E step is unreachable, since the law $Z|Y_i$ (resp $W|Y_i$) is unknown and can't be integrated out. We thus choose to approximate the law of $Z|Y_i$ (resp $W|Y_i$) with a law $\phi_i(Z)$ (resp $\phi_i(W)$), where $\phi_i$ is taken among a family of law. We thus change the objective function: - -$$\begin{align} J_Y(\theta,\phi) & = \frac 1 n \sum _{i = 1}^n J_{Y_i}(\theta, \phi_i) \\ -J_{Y_i}(\theta, \phi_i)& =\log p_{\theta}(Y_i)-K L\left[\phi_i(Z_i) \|p_{\theta}(Z_i \mid Y_i)\right]\\ -& = \mathbb{E}_{\phi_i}\left[\log p_{\theta}(Y_i, Z_i)\right] \underbrace{-\mathbb{E}_{\phi_i}[\log \phi_i(Z_i)]}_{\text {entropy } \mathcal{H}(\phi_i)} \\ -\end{align}$$ - - -We choose $\phi_i$ in a family distribution : - -$$ -\phi_i \in \mathcal{Q}_{\text {diag}}=\{ - \mathcal{N}\left(M_{i}, \operatorname{diag} (S_{i}\odot S_i )) -, M_i \in \mathbb{M} ^q, S_i \in \mathbb{R} ^q\right\} -$$ - -We choose such a Gaussian approximation since $W$ is gaussian, so that $W|Y_i$ may be well approximated. However, taking a diagonal matrix as covariance breaks the dependecy induced by $Y_i$. - -We can prove that $J_{Y_i}(\theta, \phi_i) \leq p_{\theta} (Y_i) \; \forall \phi_i$. The quantity $J_{Y}(\theta, \phi)$ is called the ELBO (Evidence Lower BOund). - -#### Variational EM - -Given an intialisation $(\theta^0, q^0)$, the variational EM aims at maximizing the ELBO alternating between two steps: - -- VE step: update $q$ -$$ -q^{t+1}=\underset{q \in \mathcal{Q}_{gauss}}{\arg \max } J_Y(\theta^{t}, q) -$$ -- M step : update $\theta$ -$$ -\theta^{t+1}=\underset{\theta}{\arg \max } J_Y(\theta, q^{t+1}) -$$ -Each step is an optimisation problem that needs to be solved using analytical forms or gradient ascent. Note that $q$ is completely determined by $M = (M_i)_{1 \leq i \leq n } \in \mathbb R ^{n\times q}$ and $S = (S_i)_{1 \leq i \leq n } \in \mathbb R ^{n\times q}$, so that $J$ is a function of $(M, S, \beta, \Sigma)$. $q = (M,S)$ are the variational parameters, $\theta = (\beta, \Sigma$) are the model parameters. - - -##### Case $p = q$ -The case $p=q$ does not perform dimension reduction, but is very fast to compute. 
-Indeed, computations show that the M-step is straightforward in this case as we can update $\Sigma$ and $\beta$ with an analytical form : - -$$ -\begin{aligned} -\Sigma^{(t+1)} & = \frac{1}{n} \sum_{i}\left(\left((M^{(t)}-X\beta)_{i} (M^{(t)}-X\beta)_{i}\right)^{\top}+S^{(t)}_{i}\right)\\ -\beta^{(t+1)} &= (X^{\top}X)^{-1}X^{\top}M^{(t)} \\ -\end{aligned} -$$ -This results in a fast algorithm, since we only need to go a gradient ascent on the variational parameters $M$ and $S$. Practice shows that we only need to do one gradient step of $M$ and $S$, update $\beta$ and $\Sigma$ with their closed form, then re-perform a gradient step on $M$ and $S$ and so on. - - -##### Case $p <q$ - -When $p<q$, we do not have any analytical form and are forced to perform gradient ascent on all the parameters. Practice shows that we can perform a gradient ascent on all the parameters at a time (doing each VE step and M step perfectly is quite inefficient). - - - - -## Importance sampling based algorithm - -In this section, we try to estimate the gradients with respect to $\theta = (C, \beta) $. - -One can use importance sampling to estimate the likelihood: - - $$p_{\theta}(Y_i) = \int \tilde p_{\theta}^{(u)}(W) \mathrm dW \approx \frac 1 {n_s} \sum_{k=1}^{n_s} \frac {\tilde p_{\theta}^{(u)}(V_k)}{g(V_k)}, ~ ~ ~(V_{k})_{1 \leq k \leq n_s} \overset{iid}{\sim} g$$ - -where $g$ is the importance law, $n_s$ is the sampling effort and - - -$$\begin{array}{ll} -\tilde p_{\theta}^{(u)}\ :& \mathbb R^{q} \to \mathbb R^+ \\ - & W \mapsto p_{\theta}(Y_i| W) p(W) = \exp \left( - \frac 12 \| W_i\|^2 - \mathbf{1}_p^{\top} \exp(O_i + \beta^{\top}X_i + CW_i) + Y_i^{\top}(O_i + \beta^{\top}X_i +CW_i)\right) \\ -\end{array}$$ - -To learn more about the (crucial) choice of $g$, please see [Memoire PLN](Memoire_PLN_21_01_2022.pdf), section 3.2.3. - -One can do the following approximation: - - - $$\begin{equation}\label{one integral} - \nabla _{\theta} \operatorname{log} p_{\theta}(Y_i) \approx \nabla_{\theta} \operatorname{log}\left(\frac 1 {n_s} \sum_{k=1}^{n_s} \frac {\tilde p_{\theta}^{(u)}(V_k)}{g(V_k)}\right)\end{equation}$$ - -And derive the gradients formula: - -$$\nabla_{\beta} \operatorname{log} p_{\theta}(Y_i)\approx X_iY_i^{\top} -\frac{\sum_{i = 1}^{n_s}\frac{\tilde p_{\theta}(V_k)}{g(V_k)}X_i\operatorname{exp}(O_i + \beta^{\top}X_i + CV_k)^{\top}}{\sum_{i = 1}^{n_s}\frac{\tilde p_{\theta}(V_k)}{g(V_k)}} $$ - -$$\nabla_{C} \operatorname{log} p_{\theta}(Y_i)\approx \frac{\sum_{i = 1}^{n_s}\frac{\tilde p_{\theta}(V_k)}{g(V_k)}\left[Y_{i}- \exp \left(O_i + \beta^{\top} X_{i}+CV_{k}{ }\right)\right] V_{k}^{\top}}{\sum_{i = 1}^{n_s}\frac{\tilde p_{\theta}(V_k)}{g(V_k)}} $$ -$$$$ - +## References -Given the estimated gradients, we can run a gradient ascent to increase the likelihood. -We use algorithm of Variance reduction such as SAGA, SAG or SVRG, implemented in the VR.py file. +Please cite our work using the following references: +- J. Chiquet, M. Mariadassou and S. Robin: Variational inference for + probabilistic Poisson PCA, the Annals of Applied Statistics, 12: + 2674–2698, 2018. 
[link](http://dx.doi.org/10.1214/18%2DAOAS1177)
diff --git a/pyPLNmodels/_closed_forms.py b/pyPLNmodels/_closed_forms.py index 0dc9e7fab2b975949f515733508f915b103bf284..8c0ec8d200e1e27b89f5bdf7a30b8047ae07b8e7 100644 --- a/pyPLNmodels/_closed_forms.py +++ b/pyPLNmodels/_closed_forms.py @@ -1,7 +1,7 @@ import torch # pylint:disable=[C0114]
-def closed_formula_covariance(covariates, latent_mean, latent_var, coef, n_samples):
+def _closed_formula_covariance(covariates, latent_mean, latent_var, coef, n_samples): """Closed form for covariance for the M step for the noPCA model.""" if covariates is None: XB = 0 @@ -14,15 +14,15 @@ def closed_formula_covariance(covariates, latent_mean, latent_var, coef, n_sampl return closed / n_samples
-def closed_formula_coef(covariates, latent_mean):
+def _closed_formula_coef(covariates, latent_mean): """Closed form for coef for the M step for the noPCA model.""" if covariates is None: return None return torch.inverse(covariates.T @ covariates) @ covariates.T @ latent_mean
-def closed_formula_pi(
+def _closed_formula_pi( offsets, latent_mean, latent_var, dirac, covariates, _coef_inflation ): poiss_param = torch.exp(offsets + latent_mean + 0.5 * torch.square(latent_var))
-    return torch.sigmoid(poiss_param + torch.mm(covariates, _coef_inflation)) * dirac
+    return torch.sigmoid(poiss_param + torch.mm(covariates, _coef_inflation)) * dirac
diff --git a/pyPLNmodels/_utils.py b/pyPLNmodels/_utils.py index 0e079ce52f357a6be07403d83c44734add074dd7..3a8493f3868bf28d6dec286c9aa6927b1efab43b 100644 --- a/pyPLNmodels/_utils.py +++ b/pyPLNmodels/_utils.py @@ -19,18 +19,18 @@ else: DEVICE = torch.device("cpu")
-class PLNPlotArgs:
+class _PlotArgs: def __init__(self, window): self.window = window self.running_times = [] self.criterions = [1] * window
-        self.elbos_list = []
+        self._elbos_list = [] @property def iteration_number(self):
-        return len(self.elbos_list)
+        return len(self._elbos_list)
-    def show_loss(self, ax=None, name_doss=""):
+    def _show_loss(self, ax=None, name_doss=""): """Show the ELBO of the algorithm along the iterations. args: @@ -46,17 +46,17 @@ class PLNPlotArgs: ax = plt.gca() ax.plot( self.running_times,
-            -np.array(self.elbos_list),
+            -np.array(self._elbos_list), label="Negative ELBO", )
-        last_elbos = np.round(self.elbos_list[-1], 6)
+        last_elbos = np.round(self._elbos_list[-1], 6) ax.set_title(f"Negative ELBO. Best ELBO ={last_elbos}") ax.set_yscale("log") ax.set_xlabel("Seconds") ax.set_ylabel("ELBO") ax.legend()
-    def show_stopping_criterion(self, ax=None, name_doss=""):
+    def _show_stopping_criteration(self, ax=None): """Show the criterion of the algorithm along the iterations. args: @@ -83,7 +83,7 @@ class PLNPlotArgs: ax.legend()
-def init_covariance(counts, covariates, coef):
+def _init_covariance(counts, covariates, coef): """Initialization for covariance for the PLN model. Take the log of counts (careful when counts=0), remove the covariates effects X@coef and then do as a MLE for Gaussians samples. @@ -102,7 +102,7 @@ def init_covariance(counts, covariates, coef): return sigma_hat
-def init_components(counts, covariates, coef, rank):
+def _init_components(counts, covariates, coef, rank): """Inititalization for components for the PLN model. Get a first guess for covariance that is easier to estimate and then takes the rank largest eigenvectors to get components. @@ -115,12 +115,12 @@ def init_components(counts, covariates, coef, rank): Returns : torch.tensor of size (p,rank). The initialization of components.
""" - sigma_hat = init_covariance(counts, covariates, coef).detach() - components = components_from_covariance(sigma_hat, rank) + sigma_hat = _init_covariance(counts, covariates, coef).detach() + components = _components_from_covariance(sigma_hat, rank) return components -def init_latent_mean( +def _init_latent_mean( counts, covariates, offsets, coef, components, n_iter_max=500, lr=0.01, eps=7e-3 ): """Initialization for the variational parameter M. Basically, @@ -160,8 +160,8 @@ def init_latent_mean( return mode -def sigmoid(tens): - """Compute the sigmoid function of x element-wise.""" +def _sigmoid(tens): + """Compute the _sigmoid function of x element-wise.""" return 1 / (1 + torch.exp(-tens)) @@ -212,7 +212,7 @@ def sample_pln(components, coef, covariates, offsets, _coef_inflation=None, seed # nan=0, neginf=0, posinf=0) -def components_from_covariance(covariance, rank): +def _components_from_covariance(covariance, rank): """Get the best matrix of size (p,rank) when covariance is of size (p,p). i.e. reduces norm(covariance-components@components.T) Args : @@ -231,15 +231,15 @@ def components_from_covariance(covariance, rank): return requested_components -def init_coef(counts, covariates, offsets): +def _init_coef(counts, covariates, offsets): if covariates is None: return None - poiss_reg = PoissonReg() + poiss_reg = _PoissonReg() poiss_reg.fit(counts, covariates, offsets) return poiss_reg.beta -def log_stirling(integer): +def _log_stirling(integer): """Compute log(n!) even for n large. We use the Stirling formula to avoid numerical infinite values of n!. Args: @@ -280,22 +280,22 @@ def log_posterior(counts, covariates, offsets, posterior_mean, components, coef) - 1 / 2 * torch.norm(posterior_mean, dim=-1) ** 2 ) second_term = torch.sum( - -torch.exp(log_lambda) + log_lambda * counts - log_stirling(counts), axis=-1 + -torch.exp(log_lambda) + log_lambda * counts - _log_stirling(counts), axis=-1 ) return first_term + second_term -def trunc_log(tens, eps=1e-16): +def _trunc_log(tens, eps=1e-16): integer = torch.min(torch.max(tens, torch.tensor([eps])), torch.tensor([1 - eps])) return torch.log(integer) -def get_offsets_from_sum_of_counts(counts): +def _get_offsets_from_sum_of_counts(counts): sum_of_counts = torch.sum(counts, axis=1) return sum_of_counts.repeat((counts.shape[1], 1)).T -def raise_wrong_dimension_error( +def _raise_wrong_dimension_error( str_first_array, str_second_array, dim_first_array, dim_second_array, dim_of_error ): msg = ( @@ -306,11 +306,11 @@ def raise_wrong_dimension_error( raise ValueError(msg) -def check_two_dimensions_are_equal( +def _check_two_dimensions_are_equal( str_first_array, str_second_array, dim_first_array, dim_second_array, dim_of_error ): if dim_first_array != dim_second_array: - raise_wrong_dimension_error( + _raise_wrong_dimension_error( str_first_array, str_second_array, dim_first_array, @@ -319,7 +319,7 @@ def check_two_dimensions_are_equal( ) -def init_S(counts, covariates, offsets, beta, C, M): +def _init_S(counts, covariates, offsets, beta, C, M): n, rank = M.shape batch_matrix = torch.matmul(C.unsqueeze(2), C.unsqueeze(1)).unsqueeze(0) CW = torch.matmul(C.unsqueeze(0), M.unsqueeze(2)).squeeze() @@ -331,7 +331,7 @@ def init_S(counts, covariates, offsets, beta, C, M): return hess_posterior -def format_data(data): +def _format_data(data): if data is None: return None if isinstance(data, pd.DataFrame): @@ -345,25 +345,27 @@ def format_data(data): ) -def format_model_param(counts, covariates, offsets, offsets_formula): - counts = 
format_data(counts) +def _format_model_param(counts, covariates, offsets, offsets_formula, take_log_offsets): + counts = _format_data(counts) if covariates is not None: - covariates = format_data(covariates) + covariates = _format_data(covariates) if offsets is None: if offsets_formula == "logsum": print("Setting the offsets as the log of the sum of counts") offsets = ( - torch.log(get_offsets_from_sum_of_counts(counts)).double().to(DEVICE) + torch.log(_get_offsets_from_sum_of_counts(counts)).double().to(DEVICE) ) else: offsets = torch.zeros(counts.shape, device=DEVICE) else: - offsets = format_data(offsets).to(DEVICE) + offsets = _format_data(offsets).to(DEVICE) + if take_log_offsets is True: + offsets = torch.log(offsets) return counts, covariates, offsets -def remove_useless_intercepts(covariates): - covariates = format_data(covariates) +def _remove_useless_intercepts(covariates): + covariates = _format_data(covariates) if covariates.shape[1] < 2: return covariates first_column = covariates[:, 0] @@ -375,17 +377,17 @@ def remove_useless_intercepts(covariates): return covariates -def check_data_shape(counts, covariates, offsets): +def _check_data_shape(counts, covariates, offsets): n_counts, p_counts = counts.shape n_offsets, p_offsets = offsets.shape - check_two_dimensions_are_equal("counts", "offsets", n_counts, n_offsets, 0) + _check_two_dimensions_are_equal("counts", "offsets", n_counts, n_offsets, 0) if covariates is not None: n_cov, _ = covariates.shape - check_two_dimensions_are_equal("counts", "covariates", n_counts, n_cov, 0) - check_two_dimensions_are_equal("counts", "offsets", p_counts, p_offsets, 1) + _check_two_dimensions_are_equal("counts", "covariates", n_counts, n_cov, 0) + _check_two_dimensions_are_equal("counts", "offsets", p_counts, p_offsets, 1) -def nice_string_of_dict(dictionnary): +def _nice_string_of_dict(dictionnary): return_string = "" for each_row in zip(*([i] + [j] for i, j in dictionnary.items())): for element in list(each_row): @@ -394,7 +396,7 @@ def nice_string_of_dict(dictionnary): return return_string -def plot_ellipse(mean_x, mean_y, cov, ax): +def _plot_ellipse(mean_x, mean_y, cov, ax): pearson = cov[0, 1] / np.sqrt(cov[0, 0] * cov[1, 1]) ell_radius_x = np.sqrt(1 + pearson) ell_radius_y = np.sqrt(1 - pearson) @@ -419,7 +421,7 @@ def plot_ellipse(mean_x, mean_y, cov, ax): return pearson -def get_components_simulation(dim, rank): +def _get_components_simulation(dim, rank): block_size = dim // rank prev_state = torch.random.get_rng_state() torch.random.manual_seed(0) @@ -457,7 +459,7 @@ def get_simulation_offsets_cov_coef(n_samples, nb_cov, dim): def get_simulated_count_data( n_samples=100, dim=25, rank=5, nb_cov=1, return_true_param=False, seed=0 ): - components = get_components_simulation(dim, rank) + components = _get_components_simulation(dim, rank) offsets, cov, true_coef = get_simulation_offsets_cov_coef(n_samples, nb_cov, dim) true_covariance = torch.matmul(components, components.T) counts, _, _ = sample_pln(components, true_coef, cov, offsets, seed=seed) @@ -484,13 +486,13 @@ def get_real_count_data(n_samples=270, dim=100): return counts -def closest(lst, element): +def _closest(lst, element): lst = np.asarray(lst) idx = (np.abs(lst - element)).argmin() return lst[idx] -class PoissonReg: +class _PoissonReg: """Poisson regressor class.""" def __init__(self): @@ -527,7 +529,7 @@ class PoissonReg: i = 0 grad_norm = 2 * tol # Criterion while i < Niter_max and grad_norm > tol: - loss = -compute_poissreg_log_like(Y, O, covariates, beta) + loss = 
-_compute_poissreg_log_like(Y, O, covariates, beta) loss.backward() optimizer.step() grad_norm = torch.norm(beta.grad) @@ -544,7 +546,7 @@ class PoissonReg: self.beta = beta -def compute_poissreg_log_like(Y, O, covariates, beta): +def _compute_poissreg_log_like(Y, O, covariates, beta): """Compute the log likelihood of a Poisson regression.""" # Matrix multiplication of X and beta. XB = torch.matmul(covariates.unsqueeze(1), beta.unsqueeze(0)).squeeze() @@ -552,7 +554,7 @@ def compute_poissreg_log_like(Y, O, covariates, beta): return torch.sum(-torch.exp(O + XB) + torch.multiply(Y, O + XB)) -def to_tensor(obj): +def _to_tensor(obj): if isinstance(obj, np.ndarray): return torch.from_numpy(obj) if isinstance(obj, torch.Tensor): @@ -562,7 +564,7 @@ def to_tensor(obj): raise TypeError("Please give either a nd.array or torch.Tensor or pd.DataFrame") -def check_dimensions_are_equal(tens1, tens2): +def _check_dimensions_are_equal(tens1, tens2): if tens1.shape[0] != tens2.shape[0] or tens1.shape[1] != tens2.shape[1]: raise ValueError("Tensors should have the same size.") @@ -611,7 +613,7 @@ def load_plnpca(path_of_directory, ranks=None): return datas -def check_right_rank(data, rank): +def _check_right_rank(data, rank): data_rank = data["latent_mean"].shape[1] if data_rank != rank: raise RuntimeError( @@ -620,24 +622,23 @@ def check_right_rank(data, rank): ) -def extract_data_from_formula(formula, data): +def _extract_data_from_formula(formula, data): dmatrix = dmatrices(formula, data=data) counts = dmatrix[0] covariates = dmatrix[1] - print("covariates size:", covariates.size) if covariates.size == 0: covariates = None offsets = data.get("offsets", None) return counts, covariates, offsets -def is_dict_of_dict(dictionnary): +def _is_dict_of_dict(dictionnary): if isinstance(dictionnary[list(dictionnary.keys())[0]], dict): return True return False -def get_dict_initialization(rank, dict_of_dict): +def _get_dict_initalization(rank, dict_of_dict): if dict_of_dict is None: return None return dict_of_dict[rank] diff --git a/pyPLNmodels/elbos.py b/pyPLNmodels/elbos.py index 082bcdfaa943478ed1cf9999ce8016256130f930..08d7a9818b2272ee89b56c8e565cdde0165b1bf8 100644 --- a/pyPLNmodels/elbos.py +++ b/pyPLNmodels/elbos.py @@ -1,6 +1,6 @@ import torch # pylint:disable=[C0114] -from ._utils import log_stirling, trunc_log -from ._closed_forms import closed_formula_covariance, closed_formula_coef +from ._utils import _log_stirling, _trunc_log +from ._closed_forms import _closed_formula_covariance, _closed_formula_coef def elbo_pln(counts, covariates, offsets, latent_mean, latent_var, covariance, coef): @@ -37,7 +37,7 @@ def elbo_pln(counts, covariates, offsets, latent_mean, latent_var, covariance, c + 0.5 * torch.log(s_rond_s) ) elbo -= 0.5 * torch.trace(torch.inverse(covariance) @ d_plus_minus_xb2) - elbo -= torch.sum(log_stirling(counts)) + elbo -= torch.sum(_log_stirling(counts)) elbo += 0.5 * n_samples * dim return elbo / n_samples @@ -62,8 +62,8 @@ def profiled_elbo_pln(counts, covariates, offsets, latent_mean, latent_var): n_samples, _ = counts.shape s_rond_s = torch.square(latent_var) offsets_plus_m = offsets + latent_mean - closed_coef = closed_formula_coef(covariates, latent_mean) - closed_covariance = closed_formula_covariance( + closed_coef = _closed_formula_coef(covariates, latent_mean) + closed_covariance = _closed_formula_covariance( covariates, latent_mean, latent_var, closed_coef, n_samples ) elbo = -0.5 * n_samples * torch.logdet(closed_covariance) @@ -72,7 +72,7 @@ def profiled_elbo_pln(counts, 
covariates, offsets, latent_mean, latent_var): - torch.exp(offsets_plus_m + s_rond_s / 2) + 0.5 * torch.log(s_rond_s) ) - elbo -= torch.sum(log_stirling(counts)) + elbo -= torch.sum(_log_stirling(counts)) return elbo / n_samples @@ -108,13 +108,13 @@ def elbo_plnpca(counts, covariates, offsets, latent_mean, latent_var, components mm_plus_s_rond_s = -0.5 * torch.sum( torch.square(latent_mean) + torch.square(latent_var) ) - log_stirlingcounts = torch.sum(log_stirling(counts)) + _log_stirlingcounts = torch.sum(_log_stirling(counts)) return ( counts_log_intensity + minus_intensity_plus_s_rond_s_cct + minuslogs_rond_s + mm_plus_s_rond_s - - log_stirlingcounts + - _log_stirlingcounts + 0.5 * n_samples * rank ) / n_samples @@ -162,12 +162,12 @@ def elbo_zi_pln( * ( counts @ offsets_plus_m - torch.exp(offsets_plus_m + s_rond_s / 2) - - log_stirling(counts), + - _log_stirling(counts), ) + pi ) - elbo -= torch.sum(pi * trunc_log(pi) + (1 - pi) * trunc_log(1 - pi)) + elbo -= torch.sum(pi * _trunc_log(pi) + (1 - pi) * _trunc_log(1 - pi)) elbo += torch.sum( pi * x_coef_inflation - torch.log(1 + torch.exp(x_coef_inflation)) ) diff --git a/pyPLNmodels/models.py b/pyPLNmodels/models.py index 99f99fe54535cff26a7960ef68c18181e77e7cce..22f43d92588884c5bf28156dea4a96a4a08c1799 100644 --- a/pyPLNmodels/models.py +++ b/pyPLNmodels/models.py @@ -16,30 +16,29 @@ from patsy import dmatrices from ._closed_forms import ( - closed_formula_coef, - closed_formula_covariance, - closed_formula_pi, + _closed_formula_coef, + _closed_formula_covariance, + _closed_formula_pi, ) from .elbos import elbo_plnpca, elbo_zi_pln, profiled_elbo_pln from ._utils import ( - PLNPlotArgs, - init_covariance, - init_components, - init_coef, - check_two_dimensions_are_equal, - init_latent_mean, - format_data, - format_model_param, - check_data_shape, - nice_string_of_dict, - plot_ellipse, - closest, - to_tensor, - check_dimensions_are_equal, - check_right_rank, - remove_useless_intercepts, - extract_data_from_formula, - get_dict_initialization, + _PlotArgs, + _init_covariance, + _init_components, + _init_coef, + _check_two_dimensions_are_equal, + _init_latent_mean, + _format_data, + _format_model_param, + _check_data_shape, + _nice_string_of_dict, + _plot_ellipse, + _closest, + _to_tensor, + _check_dimensions_are_equal, + _check_right_rank, + _extract_data_from_formula, + _get_dict_initalization, ) if torch.cuda.is_available(): @@ -58,11 +57,11 @@ class _PLN(ABC): Virtual class for all the PLN models. This class must be derivatived. The methods `get_covariance`, `compute_elbo`, - `random_init_latent_parameters` and `list_of_parameters_needing_gradient` must + `_random_init_latent_parameters` and `_list_of_parameters_needing_gradient` must be defined. """ - WINDOW = 15 + _WINDOW = 15 n_samples: int dim: int nb_cov: int @@ -70,7 +69,7 @@ class _PLN(ABC): _covariates: torch.Tensor _offsets: torch.Tensor _coef: torch.Tensor - beginnning_time: float + _beginning_time: float _latent_var: torch.Tensor _latent_mean: torch.Tensor @@ -82,19 +81,20 @@ class _PLN(ABC): offsets=None, offsets_formula="logsum", dict_initialization=None, + take_log_offsets=False, ): """ Simple initialization method wors fine. 
""" - self._counts, self._covariates, self._offsets = format_model_param( - counts, covariates, offsets, offsets_formula + self._counts, self._covariates, self._offsets = _format_model_param( + counts, covariates, offsets, offsets_formula, take_log_offsets ) - check_data_shape(self._counts, self._covariates, self._offsets) + _check_data_shape(self._counts, self._covariates, self._offsets) self._fitted = False - self.plotargs = PLNPlotArgs(self.WINDOW) + self._plotargs = _PlotArgs(self._WINDOW) if dict_initialization is not None: - self.set_init_parameters(dict_initialization) + self._set__init_parameters(dict_initialization) @__init__.register(str) def _( @@ -103,25 +103,33 @@ class _PLN(ABC): data: dict, offsets_formula="logsum", dict_initialization=None, + take_log_offsets=False, ): - counts, covariates, offsets = extract_data_from_formula(formula, data) - self.__init__(counts, covariates, offsets, offsets_formula, dict_initialization) + counts, covariates, offsets = _extract_data_from_formula(formula, data) + self.__init__( + counts, + covariates, + offsets, + offsets_formula, + dict_initialization, + take_log_offsets, + ) - def set_init_parameters(self, dict_initialization): + def _set__init_parameters(self, dict_initialization): if "coef" not in dict_initialization.keys(): print("No coef is initialized.") self.coef = None for key, array in dict_initialization.items(): - array = format_data(array) + array = _format_data(array) setattr(self, key, array) @property def fitted(self): - return + return self._fitted @property def nb_iteration_done(self): - return len(self.plotargs.elbos_list) + return len(self._plotargs._elbos_list) @property def n_samples(self): @@ -137,46 +145,46 @@ class _PLN(ABC): return 0 return self.covariates.shape[1] - def smart_init_coef(self): - self._coef = init_coef(self._counts, self._covariates, self._offsets) + def _smart_init_coef(self): + self._coef = _init_coef(self._counts, self._covariates, self._offsets) - def random_init_coef(self): + def _random_init_coef(self): if self.nb_cov == 0: self._coef = None self._coef = torch.randn((self.nb_cov, self.dim), device=DEVICE) @abstractmethod - def random_init_model_parameters(self): + def _random_init_model_parameters(self): pass @abstractmethod - def smart_init_model_parameters(self): + def _smart_init_model_parameters(self): pass @abstractmethod - def random_init_latent_parameters(self): + def _random_init_latent_parameters(self): pass - def smart_init_latent_parameters(self): + def _smart_init_latent_parameters(self): pass - def init_parameters(self, do_smart_init): + def _init_parameters(self, do_smart_init): print("Initialization ...") if do_smart_init: - self.smart_init_model_parameters() - self.smart_init_latent_parameters() + self._smart_init_model_parameters() + self._smart_init_latent_parameters() else: - self.random_init_model_parameters() - self.random_init_latent_parameters() + self._random_init_model_parameters() + self._random_init_latent_parameters() print("Initialization finished") - self.put_parameters_to_device() + self._put_parameters_to_device() - def put_parameters_to_device(self): - for parameter in self.list_of_parameters_needing_gradient: + def _put_parameters_to_device(self): + for parameter in self._list_of_parameters_needing_gradient: parameter.requires_grad_(True) @property - def list_of_parameters_needing_gradient(self): + def _list_of_parameters_needing_gradient(self): """ A list containing all the parameters that needs to be upgraded via a gradient step. 
""" @@ -203,26 +211,26 @@ class _PLN(ABC): offsets : torch.tensor or ndarray or DataFrame or None, default = None Model offset. If not `None`, size should be the same as `counts`. """ - self.print_beginning_message() - self.beginnning_time = time.time() + self._pring_beginning_message() + self._beginning_time = time.time() if self._fitted is False: - self.init_parameters(do_smart_init) + self._init_parameters(do_smart_init) else: - self.beginnning_time -= self.plotargs.running_times[-1] - self.optim = class_optimizer(self.list_of_parameters_needing_gradient, lr=lr) + self._beginning_time -= self._plotargs.running_times[-1] + self.optim = class_optimizer(self._list_of_parameters_needing_gradient, lr=lr) stop_condition = False while self.nb_iteration_done < nb_max_iteration and stop_condition == False: - loss = self.trainstep() - criterion = self.compute_criterion_and_update_plotargs(loss, tol) + loss = self._trainstep() + criterion = self._compute_criterion_and_update_plotargs(loss, tol) if abs(criterion) < tol: stop_condition = True if verbose and self.nb_iteration_done % 50 == 0: self.print_stats() - self.print_end_of_fitting_message(stop_condition, tol) + self._print_end_of_fitting_message(stop_condition, tol) self._fitted = True - def trainstep(self): + def _trainstep(self): """ simple docstrings with black errors """ @@ -230,12 +238,12 @@ class _PLN(ABC): loss = -self.compute_elbo() loss.backward() self.optim.step() - self.update_closed_forms() + self._update_closed_forms() return loss def pca_projected_latent_variables(self, n_components=None): if n_components is None: - n_components = self.get_max_components() + n_components = self._get_max_components() if n_components > self.dim: raise RuntimeError( f"You ask more components ({n_components}) than variables ({self.dim})" @@ -248,39 +256,39 @@ class _PLN(ABC): def latent_variables(self): pass - def print_end_of_fitting_message(self, stop_condition, tol): + def _print_end_of_fitting_message(self, stop_condition, tol): if stop_condition is True: print( f"Tolerance {tol} reached " - f"in {self.plotargs.iteration_number} iterations" + f"in {self._plotargs.iteration_number} iterations" ) else: print( "Maximum number of iterations reached : ", - self.plotargs.iteration_number, + self._plotargs.iteration_number, "last criterion = ", - np.round(self.plotargs.criterions[-1], 8), + np.round(self._plotargs.criterions[-1], 8), ) def print_stats(self): print("-------UPDATE-------") - print("Iteration number: ", self.plotargs.iteration_number) - print("Criterion: ", np.round(self.plotargs.criterions[-1], 8)) - print("ELBO:", np.round(self.plotargs.elbos_list[-1], 6)) - - def compute_criterion_and_update_plotargs(self, loss, tol): - self.plotargs.elbos_list.append(-loss.item()) - self.plotargs.running_times.append(time.time() - self.beginnning_time) - if self.plotargs.iteration_number > self.WINDOW: + print("Iteration number: ", self._plotargs.iteration_number) + print("Criterion: ", np.round(self._plotargs.criterions[-1], 8)) + print("ELBO:", np.round(self._plotargs._elbos_list[-1], 6)) + + def _compute_criterion_and_update_plotargs(self, loss, tol): + self._plotargs._elbos_list.append(-loss.item()) + self._plotargs.running_times.append(time.time() - self._beginning_time) + if self._plotargs.iteration_number > self._WINDOW: criterion = abs( - self.plotargs.elbos_list[-1] - - self.plotargs.elbos_list[-1 - self.WINDOW] + self._plotargs._elbos_list[-1] + - self._plotargs._elbos_list[-1 - self._WINDOW] ) - self.plotargs.criterions.append(criterion) + 
self._plotargs.criterions.append(criterion) return criterion return tol - def update_closed_forms(self): + def _update_closed_forms(self): pass @abstractmethod @@ -315,31 +323,31 @@ class _PLN(ABC): else: sns.heatmap(self.covariance, ax=ax) if savefig: - plt.savefig(name_file + self.NAME) + plt.savefig(name_file + self._NAME) plt.show() # to avoid displaying a blanck screen def __str__(self): delimiter = "=" * NB_CHARACTERS_FOR_NICE_PLOT - string = f"A multivariate Poisson Lognormal with {self.description} \n" + string = f"A multivariate Poisson Lognormal with {self._description} \n" string += f"{delimiter}\n" - string += nice_string_of_dict(self.dict_for_printing) + string += _nice_string_of_dict(self.dict_for_printing) string += f"{delimiter}\n" string += "* Useful properties\n" - string += f" {self.useful_properties_string}\n" + string += f" {self._useful_properties_string}\n" string += "* Useful methods\n" - string += f" {self.useful_methods_string}\n" - string += f"* Additional properties for {self.NAME}\n" - string += f" {self.additional_properties_string}\n" - string += f"* Additionial methods for {self.NAME}\n" - string += f" {self.additional_methods_string}" + string += f" {self._useful_methods_strings}\n" + string += f"* Additional properties for {self._NAME}\n" + string += f" {self._additional_properties_string}\n" + string += f"* Additionial methods for {self._NAME}\n" + string += f" {self._additional_methods_string}" return string @property - def additional_methods_string(self): + def _additional_methods_string(self): pass @property - def additional_properties_string(self): + def _additional_properties_string(self): pass def show(self, axes=None): @@ -351,24 +359,24 @@ class _PLN(ABC): if axes is None: _, axes = plt.subplots(1, nb_axes, figsize=(23, 5)) if self._fitted is True: - self.plotargs.show_loss(ax=axes[2]) - self.plotargs.show_stopping_criterion(ax=axes[1]) + self._plotargs._show_loss(ax=axes[2]) + self._plotargs._show_stopping_criteration(ax=axes[1]) self.display_covariance(ax=axes[0]) else: self.display_covariance(ax=axes) plt.show() @property - def elbos_list(self): - return self.plotargs.elbos_list + def _elbos_list(self): + return self._plotargs._elbos_list @property def loglike(self): if self._fitted is False: t0 = time.time() - self.plotargs.elbos_list.append(self.compute_elbo().item()) - self.plotargs.running_times.append(time.time() - t0) - return self.n_samples * self.elbos_list[-1] + self._plotargs._elbos_list.append(self.compute_elbo().item()) + self._plotargs.running_times.append(time.time() - t0) + return self.n_samples * self._elbos_list[-1] @property def BIC(self): @@ -395,24 +403,24 @@ class _PLN(ABC): } @property - def model_in_a_dict(self): - return self.dict_data | self.dict_parameters + def _model_in_a_dict(self): + return self.dict_data | self._dict_parameters @property - def dict_parameters(self): + def _dict_parameters(self): return self.model_parameters | self.latent_parameters @property def coef(self): - return self.attribute_or_none("_coef") + return self._attribute_or_none("_coef") @property def latent_mean(self): - return self.attribute_or_none("_latent_mean") + return self._attribute_or_none("_latent_mean") @property def latent_var(self): - return self.attribute_or_none("_latent_var") + return self._attribute_or_none("_latent_var") @latent_var.setter def latent_var(self, latent_var): @@ -422,7 +430,7 @@ class _PLN(ABC): def latent_mean(self, latent_mean): self._latent_mean = latent_mean - def attribute_or_none(self, attribute_name): + def 
_attribute_or_none(self, attribute_name): if hasattr(self, attribute_name): attr = getattr(self, attribute_name) if isinstance(attr, torch.Tensor): @@ -433,7 +441,7 @@ class _PLN(ABC): def save(self, path_of_directory="./"): path = f"{path_of_directory}/{self.path_to_directory}{self.directory_name}" os.makedirs(path, exist_ok=True) - for key, value in self.dict_parameters.items(): + for key, value in self._dict_parameters.items(): filename = f"{path}/{key}.csv" if isinstance(value, torch.Tensor): pd.DataFrame(np.array(value.cpu().detach())).to_csv( @@ -446,21 +454,21 @@ class _PLN(ABC): @property def counts(self): - return self.attribute_or_none("_counts") + return self._attribute_or_none("_counts") @property def offsets(self): - return self.attribute_or_none("_offsets") + return self._attribute_or_none("_offsets") @property def covariates(self): - return self.attribute_or_none("_covariates") + return self._attribute_or_none("_covariates") @counts.setter def counts(self, counts): - counts = to_tensor(counts) + counts = _to_tensor(counts) if hasattr(self, "_counts"): - check_dimensions_are_equal(self._counts, counts) + _check_dimensions_are_equal(self._counts, counts) self._counts = counts @offsets.setter @@ -490,12 +498,12 @@ class _PLN(ABC): return {"Number of iterations done": self.nb_iteration_done} @property - def useful_properties_string(self): + def _useful_properties_string(self): return ".latent_variables, .model_parameters, .latent_parameters, \ .optim_parameters" @property - def useful_methods_string(self): + def _useful_methods_strings(self): return ".show(), .coef() .transform(), .sigma(), .predict(), \ .pca_projected_latent_variables()" @@ -518,7 +526,7 @@ class _PLN(ABC): @property def directory_name(self): - return f"{self.NAME}_nbcov_{self.nb_cov}_dim_{self.dim}" + return f"{self._NAME}_nbcov_{self.nb_cov}_dim_{self.dim}" @property def path_to_directory(self): @@ -527,11 +535,11 @@ class _PLN(ABC): # need to do a good init for M and S class PLN(_PLN): - NAME = "PLN" + _NAME = "PLN" coef: torch.Tensor @property - def description(self): + def _description(self): return "full covariance model." 
@property @@ -544,20 +552,20 @@ class PLN(_PLN): def coef(self, coef): pass - def smart_init_latent_parameters(self): - self.random_init_latent_parameters() + def _smart_init_latent_parameters(self): + self._random_init_latent_parameters() - def random_init_latent_parameters(self): + def _random_init_latent_parameters(self): if not hasattr(self, "_latent_var"): self._latent_var = 1 / 2 * torch.ones((self.n_samples, self.dim)).to(DEVICE) if not hasattr(self, "_latent_mean"): self._latent_mean = torch.ones((self.n_samples, self.dim)).to(DEVICE) @property - def list_of_parameters_needing_gradient(self): + def _list_of_parameters_needing_gradient(self): return [self._latent_mean, self._latent_var] - def get_max_components(self): + def _get_max_components(self): return self.dim def compute_elbo(self): @@ -574,21 +582,21 @@ class PLN(_PLN): self._latent_var, ) - def smart_init_model_parameters(self): + def _smart_init_model_parameters(self): # no model parameters since we are doing a profiled ELBO pass - def random_init_model_parameters(self): + def _random_init_model_parameters(self): # no model parameters since we are doing a profiled ELBO pass @property def _coef(self): - return closed_formula_coef(self._covariates, self._latent_mean) + return _closed_formula_coef(self._covariates, self._latent_mean) @property def _covariance(self): - return closed_formula_covariance( + return _closed_formula_covariance( self._covariates, self._latent_mean, self._latent_var, @@ -596,8 +604,8 @@ class PLN(_PLN): self.n_samples, ) - def print_beginning_message(self): - print(f"Fitting a PLN model with {self.description}") + def _pring_beginning_message(self): + print(f"Fitting a PLN model with {self._description}") @property def latent_variables(self): @@ -630,9 +638,8 @@ class PLN(_PLN): pass -## en train d'essayer de faire une seule init pour_PLNPCA class PLNPCA: - NAME = "PLNPCA" + _NAME = "PLNPCA" @singledispatchmethod def __init__( @@ -643,27 +650,31 @@ class PLNPCA: offsets_formula="logsum", ranks=range(3, 5), dict_of_dict_initialization=None, + take_log_offsets=False, ): - self.init_data(counts, covariates, offsets, offsets_formula) - self.init_models(ranks, dict_of_dict_initialization) + self._init_data(counts, covariates, offsets, offsets_formula, take_log_offsets) + self._init_models(ranks, dict_of_dict_initialization) - def init_data(self, counts, covariates, offsets, offsets_formula): - self._counts, self._covariates, self._offsets = format_model_param( - counts, covariates, offsets, offsets_formula + def _init_data( + self, counts, covariates, offsets, offsets_formula, take_log_offsets + ): + self._counts, self._covariates, self._offsets = _format_model_param( + counts, covariates, offsets, offsets_formula, take_log_offsets ) - check_data_shape(self._counts, self._covariates, self._offsets) + _check_data_shape(self._counts, self._covariates, self._offsets) self._fitted = False @__init__.register(str) def _( self, - formula: str, + formula, data: dict, offsets_formula="logsum", ranks=range(3, 5), dict_of_dict_initialization=None, + take_log_offsets=False, ): - counts, covariates, offsets = extract_data_from_formula(formula, data) + counts, covariates, offsets = _extract_data_from_formula(formula, data) self.__init__( counts, covariates, @@ -671,6 +682,7 @@ class PLNPCA: offsets_formula, ranks, dict_of_dict_initialization, + take_log_offsets, ) @property @@ -683,14 +695,14 @@ class PLNPCA: @counts.setter def counts(self, counts): - counts = format_data(counts) + counts = _format_data(counts) if 
hasattr(self, "_counts"): - check_dimensions_are_equal(self._counts, counts) + _check_dimensions_are_equal(self._counts, counts) self._counts = counts @covariates.setter def covariates(self, covariates): - covariates = format_data(covariates) + covariates = _format_data(covariates) # if hasattr(self,) self._covariates = covariates @@ -698,12 +710,12 @@ class PLNPCA: def offsets(self): return self.list_models[0].offsets - def init_models(self, ranks, dict_of_dict_initialization): + def _init_models(self, ranks, dict_of_dict_initialization): if isinstance(ranks, (Iterable, np.ndarray)): self.list_models = [] for rank in ranks: if isinstance(rank, (int, np.integer)): - dict_initialization = get_dict_initialization( + dict_initialization = _get_dict_initalization( rank, dict_of_dict_initialization ) self.list_models.append( @@ -721,7 +733,7 @@ class PLNPCA: f"of integers or an integer." ) elif isinstance(ranks, (int, np.integer)): - dict_initialization = get_dict_initialization( + dict_initialization = _get_dict_initalization( ranks, dict_of_dict_initialization ) self.list_models = [ @@ -746,7 +758,7 @@ class PLNPCA: def dict_models(self): return {model.rank: model for model in self.list_models} - def print_beginning_message(self): + def _pring_beginning_message(self): return f"Adjusting {len(self.ranks)} PLN models for PCA analysis \n" @property @@ -768,7 +780,7 @@ class PLNPCA: do_smart_init=True, verbose=False, ): - self.print_beginning_message() + self._pring_beginning_message() for pca in self.dict_models.values(): pca.fit( nb_max_iteration, @@ -778,25 +790,25 @@ class PLNPCA: do_smart_init, verbose, ) - self.print_ending_message() + self._print_ending_message() - def print_ending_message(self): + def _print_ending_message(self): delimiter = "=" * NB_CHARACTERS_FOR_NICE_PLOT print(f"{delimiter}\n") print("DONE!") - print(f" Best model(lower BIC): {self.criterion_dict('BIC')}\n ") - print(f" Best model(lower AIC): {self.criterion_dict('AIC')}\n ") + print(f" Best model(lower BIC): {self._criterion_dict('BIC')}\n ") + print(f" Best model(lower AIC): {self._criterion_dict('AIC')}\n ") print(f"{delimiter}\n") - def criterion_dict(self, criterion="AIC"): + def _criterion_dict(self, criterion="AIC"): return self.best_model(criterion).rank def __getitem__(self, rank): if (rank in self.ranks) is False: asked_rank = rank - rank = closest(self.ranks, asked_rank) + rank = _closest(self.ranks, asked_rank) warning_string = " \n No such a model in the collection." 
- warning_string += "Returning model with closest value.\n" + warning_string += "Returning model with _closest value.\n" warning_string += f"Requested: {asked_rank}, returned: {rank}" warnings.warn(message=warning_string) return self.dict_models[rank] @@ -861,7 +873,7 @@ class PLNPCA: @property def directory_name(self): - return f"{self.NAME}_nbcov_{self.nb_cov}_dim_{self.dim}" + return f"{self._NAME}_nbcov_{self.nb_cov}_dim_{self.dim}" @property def n_samples(self): @@ -884,55 +896,55 @@ class PLNPCA: to_print += delimiter to_print += f" - Ranks considered:{self.ranks}\n" dict_bic = {"rank": "criterion"} | self.BIC - to_print += f" - BIC metric:\n{nice_string_of_dict(dict_bic)}\n" + to_print += f" - BIC metric:\n{_nice_string_of_dict(dict_bic)}\n" dict_to_print = self.best_model(criterion="BIC")._rank to_print += f" Best model(lower BIC): {dict_to_print}\n \n" dict_aic = {"rank": "criterion"} | self.AIC - to_print += f" - AIC metric:\n{nice_string_of_dict(dict_aic)}\n" + to_print += f" - AIC metric:\n{_nice_string_of_dict(dict_aic)}\n" to_print += f" Best model(lower AIC): \ {self.best_model(criterion = 'AIC')._rank}\n" to_print += delimiter to_print += f"* Useful properties\n" - to_print += f" {self.useful_properties_string}\n" + to_print += f" {self._useful_properties_string}\n" to_print += "* Useful methods \n" - to_print += f" {self.useful_methods_string}" + to_print += f" {self._useful_methods_strings}" to_print += delimiter return to_print @property - def useful_methods_string(self): + def _useful_methods_strings(self): return ".show(), .best_model()" @property - def useful_properties_string(self): + def _useful_properties_string(self): return ".BIC, .AIC, .loglikes" -# Here, setting the value for each key in dict_parameters +# Here, setting the value for each key in _dict_parameters class _PLNPCA(_PLN): - NAME = "_PLNPCA" + _NAME = "_PLNPCA" _components: torch.Tensor @singledispatchmethod def __init__(self, counts, covariates, offsets, rank, dict_initialization=None): self._rank = rank - self._counts, self._covariates, self._offsets = format_model_param( - counts, covariates, offsets, None + self._counts, self._covariates, self._offsets = _format_model_param( + counts, covariates, offsets, None, take_log_offsets=False ) - check_data_shape(self._counts, self._covariates, self._offsets) - self.check_if_rank_is_too_high() + _check_data_shape(self._counts, self._covariates, self._offsets) + self._check_if_rank_is_too_high() if dict_initialization is not None: - self.set_init_parameters(dict_initialization) + self._set__init_parameters(dict_initialization) self._fitted = False - self.plotargs = PLNPlotArgs(self.WINDOW) + self._plotargs = _PlotArgs(self._WINDOW) @__init__.register(str) def _(self, formula, data, rank, dict_initialization): - counts, covariates, offsets = extract_data_from_formula(formula, data) + counts, covariates, offsets = _extract_data_from_formula(formula, data) self.__init__(counts, covariates, offsets, rank, dict_initialization) - def check_if_rank_is_too_high(self): + def _check_if_rank_is_too_high(self): if self.dim < self.rank: warning_string = ( f"\nThe requested rank of approximation {self.rank} " @@ -944,8 +956,8 @@ class _PLNPCA(_PLN): @property def directory_name(self): - return f"{self.NAME}_rank_{self._rank}" - # return f"PLNPCA_nbcov_{self.nb_cov}_dim_{self.dim}/{self.NAME}_rank_{self._rank}" + return f"{self._NAME}_rank_{self._rank}" + # return f"PLNPCA_nbcov_{self.nb_cov}_dim_{self.dim}/{self._NAME}_rank_{self._rank}" @property def 
path_to_directory(self): @@ -955,10 +967,10 @@ class _PLNPCA(_PLN): def rank(self): return self._rank - def get_max_components(self): + def _get_max_components(self): return self._rank - def print_beginning_message(self): + def _pring_beginning_message(self): print("-" * NB_CHARACTERS_FOR_NICE_PLOT) print(f"Fitting a PLNPCA model with {self._rank} components") @@ -966,26 +978,26 @@ class _PLNPCA(_PLN): def model_parameters(self): return {"coef": self.coef, "components": self.components} - def smart_init_model_parameters(self): + def _smart_init_model_parameters(self): if not hasattr(self, "_coef"): - super().smart_init_coef() + super()._smart_init_coef() if not hasattr(self, "_components"): - self._components = init_components( + self._components = _init_components( self._counts, self._covariates, self._coef, self._rank ) - def random_init_model_parameters(self): - super().random_init_coef() + def _random_init_model_parameters(self): + super()._random_init_coef() self._components = torch.randn((self.dim, self._rank)).to(DEVICE) - def random_init_latent_parameters(self): + def _random_init_latent_parameters(self): self._latent_var = 1 / 2 * torch.ones((self.n_samples, self._rank)).to(DEVICE) self._latent_mean = torch.ones((self.n_samples, self._rank)).to(DEVICE) - def smart_init_latent_parameters(self): + def _smart_init_latent_parameters(self): if not hasattr(self, "_latent_mean"): self._latent_mean = ( - init_latent_mean( + _init_latent_mean( self._counts, self._covariates, self._offsets, @@ -1001,7 +1013,7 @@ class _PLNPCA(_PLN): ) @property - def list_of_parameters_needing_gradient(self): + def _list_of_parameters_needing_gradient(self): if self._coef is None: return [self._components, self._latent_mean, self._latent_var] return [self._components, self._coef, self._latent_mean, self._latent_var] @@ -1022,11 +1034,11 @@ class _PLNPCA(_PLN): return self.dim * (self.nb_cov + self._rank) - self._rank * (self._rank - 1) / 2 @property - def additional_properties_string(self): + def _additional_properties_string(self): return ".projected_latent_variables" @property - def additional_methods_string(self): + def _additional_methods_string(self): string = " only for rank=2: .viz()" return string @@ -1040,7 +1052,7 @@ class _PLNPCA(_PLN): return None @property - def description(self): + def _description(self): return f" {self.rank} principal component." 
@property @@ -1054,7 +1066,7 @@ class _PLNPCA(_PLN): def pca_projected_latent_variables(self, n_components=None): if n_components is None: - n_components = self.get_max_components() + n_components = self._get_max_components() if n_components > self.dim: raise RuntimeError( f"You ask more components ({n_components}) than variables ({self.dim})" @@ -1064,7 +1076,7 @@ class _PLNPCA(_PLN): @property def components(self): - return self.attribute_or_none("_components") + return self._attribute_or_none("_components") @components.setter def components(self, components): @@ -1084,7 +1096,7 @@ class _PLNPCA(_PLN): sns.scatterplot(x=x, y=y, hue=colors, ax=ax) covariances = torch.diag_embed(self._latent_var**2).detach().cpu() for i in range(covariances.shape[0]): - plot_ellipse(x[i], y[i], cov=covariances[i], ax=ax) + _plot_ellipse(x[i], y[i], cov=covariances[i], ax=ax) return ax def transform(self, project=True): @@ -1094,32 +1106,32 @@ class _PLNPCA(_PLN): class ZIPLN(PLN): - NAME = "ZIPLN" + _NAME = "ZIPLN" _pi: torch.Tensor _coef_inflation: torch.Tensor _dirac: torch.Tensor @property - def description(self): + def _description(self): return "with full covariance model and zero-inflation." - def random_init_model_parameters(self): - super().random_init_model_parameters() + def _random_init_model_parameters(self): + super()._random_init_model_parameters() self._coef_inflation = torch.randn(self.nb_cov, self.dim) self._covariance = torch.diag(torch.ones(self.dim)).to(DEVICE) # should change the good initialization, especially for _coef_inflation - def smart_init_model_parameters(self): - super().smart_init_model_parameters() + def _smart_init_model_parameters(self): + super()._smart_init_model_parameters() if not hasattr(self, "_covariance"): - self._covariance = init_covariance( + self._covariance = _init_covariance( self._counts, self._covariates, self._coef ) if not hasattr(self, "_coef_inflation"): self._coef_inflation = torch.randn(self.nb_cov, self.dim) - def random_init_latent_parameters(self): + def _random_init_latent_parameters(self): self._dirac = self._counts == 0 self._latent_mean = torch.randn(self.n_samples, self.dim) self._latent_var = torch.randn(self.n_samples, self.dim) @@ -1143,19 +1155,19 @@ class ZIPLN(PLN): ) @property - def list_of_parameters_needing_gradient(self): + def _list_of_parameters_needing_gradient(self): return [self._latent_mean, self._latent_var, self._coef_inflation] - def update_closed_forms(self): - self._coef = closed_formula_coef(self._covariates, self._latent_mean) - self._covariance = closed_formula_covariance( + def _update_closed_forms(self): + self._coef = _closed_formula_coef(self._covariates, self._latent_mean) + self._covariance = _closed_formula_covariance( self._covariates, self._latent_mean, self._latent_var, self._coef, self.n_samples, ) - self._pi = closed_formula_pi( + self._pi = _closed_formula_pi( self._offsets, self._latent_mean, self._latent_var, diff --git a/setup.py b/setup.py index 01f54f5c23e97e80ac64fc9f6cbcb07b3fc3d950..cf7dd83ce97d998c553e4362d4cefeeaaaada2e6 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from setuptools import setup, find_packages -VERSION = "0.0.41" +VERSION = "0.0.47" with open("README.md", "r") as fh: long_description = fh.read() diff --git a/test.py b/test.py deleted file mode 100644 index b9d5e686d2a75cc14ad36c59459700ddf463bbe0..0000000000000000000000000000000000000000 --- a/test.py +++ /dev/null @@ -1,30 +0,0 @@ -from pyPLNmodels.models import PLNPCA, _PLNPCA, PLN -from 
pyPLNmodels import get_real_count_data, get_simulated_count_data - -import os -import pandas as pd -import numpy as np - -os.chdir("./pyPLNmodels/") - - -counts = get_real_count_data() -covariates = None -offsets = None -# counts, covariates, offsets = get_simulated_count_data(seed = 0) - -pca = PLNPCA(counts, covariates, offsets, ranks=[3, 4]) -pca.fit(tol=0.1) - -# pca.fit() -# print(pca) -# data = pd.DataFrame(counts) -# pln = PLN("counts~1", data) -# pln.fit() -# print(pln) -# pcamodel = pca.best_model() -# pcamodel.save() -# model = PLNPCA([4])[4] - -# model.load() -# # pln.fit(counts, covariates, offsets, tol=0.1) diff --git a/tests/conftest.py b/tests/conftest.py index 904240c9611cc87e63c8495600d21bfdfe94b6a8..1fd52a44e29889465b7a832f536dab2c86e59d81 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -104,7 +104,7 @@ def _(formula, data, offsets_formula=None, dict_initialization=None): def generate_new_model(model, *args, **kwargs): name_dir = model.directory_name - name = model.NAME + name = model._NAME if name in ("PLN", "_PLNPCA"): path = model.path_to_directory + name_dir init = load_model(path) diff --git a/tests/test_common.py b/tests/test_common.py index b74cf9fbae73676d7c5bcf34643324c63e07ad4a..42ae334919d9740e94b8481bd19c21269b2cba95 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -27,8 +27,8 @@ def test_print(any_pln): @filter_models(["PLN", "_PLNPCA"]) def test_show_coef_transform_covariance_pcaprojected(any_pln): any_pln.show() - any_pln.plotargs.show_loss() - any_pln.plotargs.show_stopping_criterion() + any_pln._plotargs._show_loss() + any_pln._plotargs._show_stopping_criteration() assert hasattr(any_pln, "coef") assert callable(any_pln.transform) assert hasattr(any_pln, "covariance") @@ -105,7 +105,7 @@ def test_random_init(instance): @pytest.mark.parametrize("instance", dict_fixtures["instances"]) -def test_print_end_of_fitting_message(instance): +def test__print_end_of_fitting_message(instance): instance.fit(nb_max_iteration=4) diff --git a/tests/test_pln_full.py b/tests/test_pln_full.py index 1f9d6a5d06dd4e207db265ed5981629d9da19e88..a98873536dd581493020e13662ea979db340a7b0 100644 --- a/tests/test_pln_full.py +++ b/tests/test_pln_full.py @@ -7,7 +7,7 @@ from tests.utils import filter_models @pytest.mark.parametrize("fitted_pln", dict_fixtures["fitted_pln"]) @filter_models(["PLN"]) def test_number_of_iterations_pln_full(fitted_pln): - nb_iterations = len(fitted_pln.elbos_list) + nb_iterations = len(fitted_pln._elbos_list) assert 50 < nb_iterations < 300 diff --git a/tests/test_plnpca.py b/tests/test_plnpca.py index 9eb1b2f4289efce04cef20a824570e407aede96e..85f4288517e8253ec5be2997769a2e444790bb96 100644 --- a/tests/test_plnpca.py +++ b/tests/test_plnpca.py @@ -26,7 +26,7 @@ def test_projected_variables(plnpca): @pytest.mark.parametrize("fitted_pln", dict_fixtures["fitted_pln"]) @filter_models(["_PLNPCA"]) def test_number_of_iterations_plnpca(fitted_pln): - nb_iterations = len(fitted_pln.elbos_list) + nb_iterations = len(fitted_pln._elbos_list) assert 100 < nb_iterations < 5000 @@ -64,7 +64,7 @@ def test_viz_pca(plnpca): @pytest.mark.parametrize("plnpca", dict_fixtures["loaded_and_fitted_pln"]) @filter_models(["PLNPCA"]) -def test_closest(plnpca): +def test__closest(plnpca): with pytest.warns(UserWarning): plnpca[9]
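
The renames in this patch only touch private helpers and attributes (now prefixed with `_`); the public collection API exercised by the removed `test.py` and by `tests/test_plnpca.py` is unchanged. Below is a minimal usage sketch, not itself part of the diff, assuming the imports and call signatures shown in those test files (`get_real_count_data`, `PLNPCA(counts, covariates, offsets, ranks=...)`, `.fit()`, `.best_model()`, and rank indexing that warns and falls back to the closest available rank):

```
from pyPLNmodels import get_real_count_data
from pyPLNmodels.models import PLNPCA

# Example count matrix shipped with the package (as used in the removed test.py).
counts = get_real_count_data()

# Fit a small collection of PCA models; covariates and offsets are left to None here.
pca = PLNPCA(counts, None, None, ranks=[3, 4])
pca.fit(tol=0.1)  # prints the BIC/AIC summary once every rank has been fitted

# Select the rank with the lowest BIC and inspect it.
best = pca.best_model(criterion="BIC")
print(best)

# Asking for a rank that is not in the collection warns and returns the closest one,
# here the rank-4 model.
model = pca[9]
```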