diff --git a/examples/robust_regression.py b/examples/robust_regression.py
new file mode 100644
index 0000000000000000000000000000000000000000..08d4d3112264957b58f461d65269fe7ddf7ff4e6
--- /dev/null
+++ b/examples/robust_regression.py
@@ -0,0 +1,132 @@
+# Demo of a robust regression model with multivariate-t distributed noise
+
+import numpy as np
+import numpy.random as npr
+np.random.seed(0)
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+sns.set_style("white")
+
+from pybasicbayes.util.text import progprint_xrange
+from pybasicbayes.distributions import Regression, RobustRegression
+
+D_out = 1
+D_in = 2
+N = 100
+
+# Make a regression model and simulate data
+A = npr.randn(D_out, D_in)
+b = npr.randn(D_out)
+Sigma = 0.1 * np.eye(D_out)
+
+true_reg = Regression(A=np.column_stack((A, b)), sigma=Sigma, affine=True)
+X = npr.randn(N, D_in)
+y = true_reg.rvs(x=X, return_xy=False)
+
+# Corrupt a fraction of the data
+inds = npr.rand(N) < 0.1
+y[inds] = 3 * npr.randn(inds.sum(), D_out)
+
+# Make a test regression and fit it
+std_reg = Regression(nu_0=D_out + 2,
+                     S_0=np.eye(D_out),
+                     M_0=np.zeros((D_out, D_in+1)),
+                     K_0=np.eye(D_in+1),
+                     affine=True)
+
+robust_reg = RobustRegression(nu_0=D_out+2,
+                              S_0=np.eye(D_out),
+                              M_0=np.zeros((D_out, D_in+1)),
+                              K_0=np.eye(D_in+1),
+                              affine=True)
+
+def _collect(r):
+    ll = r.log_likelihood((X, y))[~inds].sum()
+    err = ((y - r.predict(X))**2).sum(1)
+    mse = np.mean(err[~inds])
+    return r.A.copy(), ll, mse
+
+def _update(r):
+    r.resample([(X,y)])
+    return _collect(r)
+
+# Fit the standard regression
+smpls = [_collect(std_reg)]
+for _ in progprint_xrange(100):
+    smpls.append(_update(std_reg))
+smpls = zip(*smpls)
+std_As, std_lls, std_mses = tuple(map(np.array, smpls))
+
+# Fit the robust regression
+smpls = [_collect(robust_reg)]
+for _ in progprint_xrange(100):
+    smpls.append(_update(robust_reg))
+smpls = zip(*smpls)
+robust_As, robust_lls, robust_mses = tuple(map(np.array, smpls))
+
+
+# Plot the inferred regression function
+plt.figure(figsize=(8, 4))
+xlim = (-3, 3)
+ylim = abs(y).max()
+npts = 50
+x1, x2 = np.meshgrid(np.linspace(*xlim, npts), np.linspace(*xlim, npts))
+
+plt.subplot(131)
+mu = true_reg.predict(np.column_stack((x1.ravel(), x2.ravel())))
+plt.imshow(mu.reshape((npts, npts)),
+           cmap="RdBu", vmin=-ylim, vmax=ylim,
+           alpha=0.8,
+           extent=xlim + tuple(reversed(xlim)))
+plt.scatter(X[~inds,0], X[~inds,1], c=y[~inds, 0], cmap="RdBu", vmin=-ylim, vmax=ylim, edgecolors='gray')
+plt.scatter(X[inds,0], X[inds,1], c=y[inds, 0], cmap="RdBu", vmin=-ylim, vmax=ylim, edgecolors='k', linewidths=1)
+plt.xlim(xlim)
+plt.ylim(xlim)
+plt.title("True")
+
+plt.subplot(132)
+mu = std_reg.predict(np.column_stack((x1.ravel(), x2.ravel())))
+plt.imshow(mu.reshape((npts, npts)),
+           cmap="RdBu", vmin=-ylim, vmax=ylim,
+           alpha=0.8,
+           extent=xlim + tuple(reversed(xlim)))
+plt.scatter(X[~inds,0], X[~inds,1], c=y[~inds, 0], cmap="RdBu", vmin=-ylim, vmax=ylim, edgecolors='gray')
+plt.scatter(X[inds,0], X[inds,1], c=y[inds, 0], cmap="RdBu", vmin=-ylim, vmax=ylim, edgecolors='k', linewidths=1)
+plt.xlim(xlim)
+plt.ylim(xlim)
+plt.title("Standard Regression")
+
+plt.subplot(133)
+mu = robust_reg.predict(np.column_stack((x1.ravel(), x2.ravel())))
+plt.imshow(mu.reshape((npts, npts)),
+           cmap="RdBu", vmin=-ylim, vmax=ylim,
+           alpha=0.8,
+           extent=xlim + tuple(reversed(xlim)))
+plt.scatter(X[~inds,0], X[~inds,1], c=y[~inds, 0], cmap="RdBu", vmin=-ylim, vmax=ylim, edgecolors='gray')
+plt.scatter(X[inds,0], X[inds,1], c=y[inds, 0], cmap="RdBu", vmin=-ylim, vmax=ylim,
+            edgecolors='k', linewidths=1)
+plt.xlim(xlim)
+plt.ylim(xlim)
+plt.title("Robust Regression")
+
+
+print("True A: {}".format(true_reg.A))
+print("Std A: {}".format(std_As.mean(0)))
+print("Robust A: {}".format(robust_As.mean(0)))
+
+# Plot the log likelihoods and mean squared errors
+plt.figure(figsize=(8, 4))
+plt.subplot(121)
+plt.plot(std_lls)
+plt.plot(robust_lls)
+plt.xlabel("Iteration")
+plt.ylabel("Log Likelihood")
+
+plt.subplot(122)
+plt.plot(std_mses, label="Standard")
+plt.plot(robust_mses, label="Robust")
+plt.legend(loc="upper right")
+plt.xlabel("Iteration")
+plt.ylabel("Mean Squared Error")
+
+plt.show()
diff --git a/pybasicbayes/distributions/gaussian.py b/pybasicbayes/distributions/gaussian.py
index 66ceed683f92f8e331bf31fdb5d589427f02da77..f283986f623439653a8dd78270a17a3584df8bab 100644
--- a/pybasicbayes/distributions/gaussian.py
+++ b/pybasicbayes/distributions/gaussian.py
@@ -344,10 +344,11 @@ class Gaussian(
             - D*self.kappa_0/self.kappa_mf - self.kappa_0*self.nu_mf*
             np.dot(self.mu_mf - self.mu_0,np.linalg.solve(self.sigma_mf,self.mu_mf - self.mu_0))) \
-            + invwishart_log_partitionfunction(self.sigma_0,self.nu_0) \
+            - invwishart_log_partitionfunction(self.sigma_0,self.nu_0) \
             + (self.nu_0 - D - 1)/2*loglmbdatilde - 1/2*self.nu_mf \
             * np.linalg.solve(self.sigma_mf,self.sigma_0).trace()
+
         return p_avgengy + q_entropy
 
     def expected_log_likelihood(self, x=None, stats=None):
diff --git a/pybasicbayes/distributions/regression.py b/pybasicbayes/distributions/regression.py
index 9e540272ab1e863718929a5966a3281698277bcf..b524061ee1a0f4ea1618164c61f143db695f4cc7 100644
--- a/pybasicbayes/distributions/regression.py
+++ b/pybasicbayes/distributions/regression.py
@@ -2,11 +2,17 @@ from __future__ import division
 from builtins import zip
 from builtins import range
 __all__ = ['Regression', 'RegressionNonconj', 'ARDRegression',
-           'AutoRegression', 'ARDAutoRegression', 'DiagonalRegression']
+           'AutoRegression', 'ARDAutoRegression', 'DiagonalRegression',
+           'RobustRegression', 'RobustAutoRegression']
+
+from warnings import warn
 
 import numpy as np
 from numpy import newaxis as na
+from scipy.linalg import solve_triangular
+from scipy.special import gammaln, digamma, polygamma
+
 from pybasicbayes.abstractions import GibbsSampling, MaxLikelihood, \
     MeanField, MeanFieldSVI
 from pybasicbayes.util.stats import sample_gaussian, sample_mniw, \
@@ -86,12 +92,15 @@ class Regression(GibbsSampling, MeanField, MaxLikelihood):
 
     @staticmethod
     def _natural_to_standard(natparam):
-        A,B,C,d = natparam
+        A,B,C,d = natparam  # natparam is roughly (yyT, yxT, xxT, n)
         nu = d
         Kinv = C
         K = inv_psd(Kinv)
-        M = B.dot(K)
-        S = A - B.dot(K).dot(B.T)
+        # M = B.dot(K)
+        M = np.linalg.solve(Kinv, B.T).T
+        # This subtraction seems unstable!
+        # It does not necessarily return a PSD matrix
+        S = A - M.dot(B.T)
 
         # numerical padding here...
         K += 1e-8*np.eye(K.shape[0])
@@ -99,6 +108,7 @@
         assert np.all(0 < np.linalg.eigvalsh(S))
         assert np.all(0 < np.linalg.eigvalsh(K))
 
+        # standard is degrees of freedom, mean of sigma (ish), mean of A, cov of rows of A
         return nu, S, M, K
 
     ### getting statistics
@@ -564,6 +574,10 @@ class DiagonalRegression(Regression, MeanFieldSVI):
         # Cache the standard parameters for A as well
         self._mf_A_cache = {}
 
+        # Store the natural hypparams.  These correspond to the suff. stats
+        # (y^2, yxT, xxT, n)
+        # self.natural_hypparam = (2 * self.beta_0, self.h_0, self.J_0, 1.0)
+
     @property
     def D_out(self):
         return self._D_out
@@ -761,7 +775,6 @@ class DiagonalRegression(Regression, MeanFieldSVI):
         ysq, yxT, xxT, n = stats
 
-        assert np.all(n > 0), "Cannot perform max likelihood with zero data points!"
         self.A = np.array([
             np.linalg.solve(self.J_0 + xxTd, self.h_0 + yxTd)
             for xxTd, yxTd in zip(xxT, yxT)
         ])
@@ -787,6 +800,11 @@ class DiagonalRegression(Regression, MeanFieldSVI):
         self._meanfieldupdate_A(stats)
         self._meanfieldupdate_sigma(stats)
 
+        # Update A and sigmasq_flat
+        A, _, sigmasq_inv, _ = self.mf_expectations
+        self.A = A.copy()
+        self.sigmasq_flat = 1. / sigmasq_inv
+
     def _meanfieldupdate_A(self, stats, prob=1.0, stepsize=1.0):
         E_sigmasq_inv = self.mf_alpha / self.mf_beta
         _, E_yxT, E_xxT, _ = stats / prob
@@ -890,6 +908,249 @@ class DiagonalRegression(Regression, MeanFieldSVI):
         self._meanfieldupdate_sigma(stats, prob=prob, stepsize=stepsize)
 
 
+
+class RobustRegression(Regression):
+    """
+    Regression with multivariate-t distributed noise.
+
+        y | x ~ t(Ax + b, \Sigma, \nu)
+
+    where \nu >= 1 is the degrees of freedom.
+
+    This is equivalent to the model,
+
+        \tau ~ Gamma(\nu/2, \nu/2)
+        y | x, \tau ~ N(Ax + b, \Sigma / \tau)
+
+    To perform inference in this model, we introduce the
+    auxiliary precision variables \tau.  Given these, we can
+    compute sufficient statistics scaled by \tau and use the
+    standard Regression updates for A, b, Sigma | x, y, \tau.
+
+    The degrees of freedom parameter \nu is resampled with a
+    Metropolis-Hastings update under a Gamma(alpha, beta) prior
+    (see _resample_nu below).  We could also experiment with
+    updating \nu under an uninformative prior, e.g.
+    p(\nu) \propto \nu^{-2}, which is equivalent to a flat
+    prior on \nu^{-1}.
+ """ + def __init__( + self, nu_0=None,S_0=None, M_0=None, K_0=None, affine=False, + A=None, sigma=None, nu=None): + + # Default to a somewhat intermediate value of nu + self.nu = self.default_nu = nu if nu is not None else 4.0 + + super(RobustRegression, self).__init__( + nu_0=nu_0, S_0=S_0, M_0=M_0, K_0=K_0, affine=affine, A=A, sigma=sigma) + + def log_likelihood(self,xy): + assert isinstance(xy, (tuple, np.ndarray)) + sigma, D, nu = self.sigma, self.D_out, self.nu + x, y = (xy[:,:-D], xy[:,-D:]) if isinstance(xy,np.ndarray) else xy + + sigma_inv, L = inv_psd(sigma, return_chol=True) + r = y - self.predict(x) + z = sigma_inv.dot(r.T).T + + out = -0.5 * (nu + D) * np.log(1.0 + (r * z).sum(1) / nu) + out += gammaln((nu + D) / 2.0) - gammaln(nu / 2.0) - D / 2.0 * np.log(nu) \ + - D / 2.0 * np.log(np.pi) - np.log(np.diag(L)).sum() + + return out + + def rvs(self,x=None,size=1,return_xy=True): + A, sigma, nu, D = self.A, self.sigma, self.nu, self.D_out + + if self.affine: + A, b = A[:,:-1], A[:,-1] + + x = np.random.normal(size=(size, A.shape[1])) if x is None else x + N = x.shape[0] + mu = self.predict(x) + + # Sample precisions and t-distributed residuals + tau = np.random.gamma(nu / 2.0, 2.0 / nu, size=(N,)) + resid = np.random.randn(N, D).dot(np.linalg.cholesky(sigma).T) + resid /= np.sqrt(tau[:, None]) + + y = mu + resid + return np.hstack((x,y)) if return_xy else y + + def _get_statistics(self, data): + raise Exception("RobustRegression needs scaled statistics.") + + def _get_scaled_statistics(self, data, precisions): + assert isinstance(data, (list, tuple, np.ndarray)) + if isinstance(data,list): + return sum((self._get_scaled_statistics(d, p) for d, p in zip(data, precisions)), + self._empty_statistics()) + + elif isinstance(data, tuple): + x, y = data + bad = np.isnan(x).any(1) | np.isnan(y).any(1) + x, y = x[~bad], y[~bad] + precisions = precisions[~bad] + sqrt_prec = np.sqrt(precisions) + n, D = y.shape + + if self.affine: + x = np.column_stack((x, np.ones(n))) + + # Scale by the precision + # xs = x * sqrt_prec[:, na] + # ys = y * sqrt_prec[:, na] + xs = x * np.tile(sqrt_prec[:, None], (1, x.shape[1])) + ys = y * np.tile(sqrt_prec[:, None], (1, D)) + + xxT, yxT, yyT = xs.T.dot(xs), ys.T.dot(xs), ys.T.dot(ys) + return np.array([yyT, yxT, xxT, n]) + + else: + # data passed in like np.hstack((x, y)) + # x, y = data[:,:-self.D_out], data[:,-self.D_out:] + # return self._get_scaled_statistics((x, y), precisions) + bad = np.isnan(data).any(1) + data = data[~bad] + precisions = precisions[~bad] + n, D = data.shape[0], self.D_out + + # This tile call is suboptimal but without it we can hit issues + # with strided data, as in autoregressive models. + scaled_data = data * np.tile(precisions[:,None], (1, data.shape[1])) + statmat = scaled_data.T.dot(data) + + xxT, yxT, yyT = \ + statmat[:-D,:-D], statmat[-D:,:-D], statmat[-D:,-D:] + + if self.affine: + xy = scaled_data.sum(0) + x, y = xy[:-D], xy[-D:] + xxT = blockarray([[xxT, x[:,na]], + [x[na,:], np.atleast_2d(precisions.sum())]]) + yxT = np.hstack((yxT, y[:,na])) + + return np.array([yyT, yxT, xxT, n]) + + def resample(self, data=[], stats=None): + assert stats is None, \ + "We only support RobustRegression.resample() with data, not stats." 
+
+        # First sample auxiliary variables for each data point
+        tau = self._resample_precision(data)
+
+        # Compute statistics, scaling by tau, and resample as in standard Regression
+        stats = self._get_scaled_statistics(data, tau)
+        super(RobustRegression, self).resample(stats=stats)
+
+        # Resample degrees of freedom \nu
+        self._resample_nu(tau)
+
+    def _resample_precision(self, data):
+        assert isinstance(data, (list, tuple, np.ndarray))
+        if isinstance(data, list):
+            return [self._resample_precision(d) for d in data]
+
+        elif isinstance(data, tuple):
+            x, y = data
+
+        else:
+            x, y = data[:, :-self.D_out], data[:, -self.D_out:]
+
+        assert x.ndim == y.ndim == 2
+        assert x.shape[0] == y.shape[0]
+        assert x.shape[1] == (self.D_in - 1 if self.affine else self.D_in)
+        assert y.shape[1] == self.D_out
+        N = x.shape[0]
+
+        # Weed out the nan's
+        bad = np.any(np.isnan(x), axis=1) | np.any(np.isnan(y), axis=1)
+
+        # Compute posterior params of gamma distribution
+        a_post = self.nu / 2.0 + self.D_out / 2.0
+
+        r = y - self.predict(x)
+        sigma_inv = inv_psd(self.sigma)
+        z = sigma_inv.dot(r.T).T
+        b_post = self.nu / 2.0 + (r * z).sum(1) / 2.0
+
+        assert np.isscalar(a_post) and b_post.shape == (N,)
+        tau = np.nan * np.ones(N)
+        tau[~bad] = np.random.gamma(a_post, 1./b_post[~bad])
+
+        return tau
+
+    def _resample_nu(self, tau, N_steps=100, prop_std=0.1, alpha=1, beta=1):
+        """
+        Update the degrees of freedom parameter with Metropolis-Hastings.
+        Assume a prior nu ~ Ga(alpha, beta) and use a symmetric proposal
+        nu' ~ N(nu, prop_std^2).  Proposals below a small positive
+        threshold are rejected outright.
+        """
+        # Convert tau to a list of arrays
+        taus = [tau] if isinstance(tau, np.ndarray) else tau
+
+        N = 0
+        E_tau = 0
+        E_logtau = 0
+        for tau in taus:
+            bad = ~np.isfinite(tau)
+            N += np.sum(~bad)
+            E_tau += np.sum(tau[~bad])
+            E_logtau += np.sum(np.log(tau[~bad]))
+
+        if N > 0:
+            E_tau /= N
+            E_logtau /= N
+
+        # Compute the log prior, likelihood, and posterior
+        lprior = lambda nu: (alpha - 1) * np.log(nu) - beta * nu
+        ll = lambda nu: N * (nu/2 * np.log(nu/2) - gammaln(nu/2)
+                             + (nu/2 - 1) * E_logtau - nu/2 * E_tau)
+        lp = lambda nu: ll(nu) + lprior(nu)
+
+        lp_curr = lp(self.nu)
+        for step in range(N_steps):
+            # Symmetric proposal
+            nu_new = self.nu + prop_std * np.random.randn()
+            if nu_new < 1e-3:
+                # Reject if too small
+                continue
+
+            # Accept / reject based on likelihoods
+            lp_new = lp(nu_new)
+            if np.log(np.random.rand()) < lp_new - lp_curr:
+                self.nu = nu_new
+                lp_curr = lp_new
+
+    # Not supporting MLE or mean field for now
+    def max_likelihood(self,data,weights=None,stats=None):
+        raise NotImplementedError
+
+    def meanfieldupdate(self, data=None, weights=None, stats=None):
+        raise NotImplementedError
+
+    def meanfield_sgdstep(self, data, weights, prob, stepsize, stats=None):
+        raise NotImplementedError
+
+    def meanfield_expectedstats(self):
+        raise NotImplementedError
+
+    def expected_log_likelihood(self, xy=None, stats=None):
+        raise NotImplementedError
+
+    def get_vlb(self):
+        raise NotImplementedError
+
+    def resample_from_mf(self):
+        raise NotImplementedError
+
+    def _set_params_from_mf(self):
+        raise NotImplementedError
+
+    def _initialize_mean_field(self):
+        pass
+
+
 class _ARMixin(object):
     @property
     def nlags(self):
@@ -903,7 +1164,7 @@ class _ARMixin(object):
         return self.D_out
 
     def predict(self, x):
-        return super(_ARMixin,self).predict(np.atleast_2d(x.ravel()))
+        return super(_ARMixin,self).predict(np.atleast_2d(x))
 
     def rvs(self,lagged_data):
         return super(_ARMixin,self).rvs(
@@ -939,3 +1200,7 @@ class ARDAutoRegression(_ARMixin,ARDRegression):
             + ([1] if M_0.shape[1] % M_0.shape[0] and M_0.shape[0] != 1 else [])
         super(ARDAutoRegression,self).__init__(
             M_0=M_0,blocksizes=blocksizes,**kwargs)
+
+
+class RobustAutoRegression(_ARMixin, RobustRegression):
+    pass
diff --git a/pybasicbayes/models/factor_analysis.py b/pybasicbayes/models/factor_analysis.py
index 0fa11857b16f01d0f4f12123e80115260686cc93..23aa648611b11b7e1a6a0a63c175eff98e01d1eb 100644
--- a/pybasicbayes/models/factor_analysis.py
+++ b/pybasicbayes/models/factor_analysis.py
@@ -23,12 +23,12 @@ class FactorAnalysisStates(object):
     """
     Wrapper for the latent states of a factor analysis model
     """
-    def __init__(self, model, data, mask=None):
+    def __init__(self, model, data, mask=None, **kwargs):
         self.model = model
         self.X = data
-        self.mask = mask
         if mask is None:
             mask = np.ones_like(data, dtype=bool)
+        self.mask = mask
         assert data.shape == mask.shape and mask.dtype == bool
         assert self.X.shape[1] == self.D_obs
@@ -58,8 +58,18 @@ def log_likelihood(self):
-        mu = np.dot(self.Z, self.W.T)
-        return -0.5 * np.sum(((self.X - mu) * self.mask) ** 2 / self.sigmasq)
+        # mu = np.dot(self.Z, self.W.T)
+        # return -0.5 * np.sum(((self.X - mu) * self.mask) ** 2 / self.sigmasq)
+
+        # Compute the marginal likelihood, integrating out z
+        mu_x = np.zeros(self.D_obs)
+        Sigma_x = self.W.dot(self.W.T) + np.diag(self.sigmasq)
+
+        if not np.all(self.mask):
+            raise Exception("Need to implement this!")
+        else:
+            from scipy.stats import multivariate_normal
+            return multivariate_normal(mu_x, Sigma_x).logpdf(self.X)
 
     ## Gibbs
     def resample(self):
@@ -142,6 +152,7 @@ class FactorAnalysisStates(object):
 
 class _FactorAnalysisBase(Model):
     __metaclass__ = abc.ABCMeta
+    _states_class = FactorAnalysisStates
 
     def __init__(self, D_obs, D_latent,
                  W=None, sigmasq=None,
@@ -169,8 +180,8 @@ class _FactorAnalysisBase(Model):
     def sigmasq(self):
         return self.regression.sigmasq_flat
 
-    def add_data(self, data, mask=None):
-        self.data_list.append(FactorAnalysisStates(self, data, mask=mask))
+    def add_data(self, data, mask=None, **kwargs):
+        self.data_list.append(self._states_class(self, data, mask=mask, **kwargs))
         return self.data_list[-1]
 
     def generate(self, keep=True, N=1, mask=None, **kwargs):
@@ -179,14 +190,20 @@ class _FactorAnalysisBase(Model):
 
         Z = np.random.randn(N, self.D_latent)
         X = np.dot(Z, W.T) + np.sqrt(sigmasq) * np.random.randn(N, self.D_obs)
 
-        data = FactorAnalysisStates(self, X, mask=mask)
+        data = self._states_class(self, X, mask=mask, **kwargs)
         data.Z = Z
         if keep:
             self.data_list.append(data)
-        return data
+        return data.X, data.Z
+
+    def _log_likelihoods(self, x, mask=None, **kwargs):
+        self.add_data(x, mask=mask, **kwargs)
+        states = self.data_list.pop()
+        return states.log_likelihood()
 
     def log_likelihood(self):
-        return np.sum([d.log_likelihood() for d in self.data_list])
+        return sum([d.log_likelihood().sum() for d in self.data_list])
+
     def log_probability(self):
         lp = 0
diff --git a/pybasicbayes/models/mixture.py b/pybasicbayes/models/mixture.py
index 1675f01299e1d024cd4f2228d7d3dfdb4d82b7ca..43461c4a7c78ae06921d73ac8c2d4f188700311d 100644
--- a/pybasicbayes/models/mixture.py
+++ b/pybasicbayes/models/mixture.py
@@ -477,7 +477,7 @@ class Mixture(ModelGibbsSampling, ModelMeanField, ModelEM, ModelParallelTemperin
             [l.expectations[:,idx] for l in self.labels_list])
 
         # mixture weights
-        self.weights.max_likelihood(np.arange(len(self.components)),
+        self.weights.max_likelihood(None,
             [l.expectations for l in self.labels_list])
 
     @property
diff --git a/pybasicbayes/util/stats.py b/pybasicbayes/util/stats.py
index 7b704570de5102826762250e7d201df33aa38ffa..9ea9b479a449c970aaffc7cab0e6f1a124461c6d 100644
--- a/pybasicbayes/util/stats.py
+++ b/pybasicbayes/util/stats.py
@@ -317,6 +317,7 @@ def invwishart_entropy(sigma,nu,chol=None):
     return invwishart_log_partitionfunction(sigma,nu,chol)-(nu-D-1)/2*Elogdetlmbda + nu*D/2
 
 def invwishart_log_partitionfunction(sigma,nu,chol=None):
+    # In Bishop B.79 notation, this is -log B(W, nu), where W = sigma^{-1}
     D = sigma.shape[0]
     chol = np.linalg.cholesky(sigma) if chol is None else chol
     return -1*(nu*np.log(chol.diagonal()).sum() - (nu*D/2*np.log(2) + D*(D-1)/4*np.log(np.pi) \
diff --git a/setup.py b/setup.py
index 0824dbfeb3fad3249d527a91fc3bb23a003a623f..360f189fb7b787b8a8cf7c0e4bc0768bd19cbfb2 100644
--- a/setup.py
+++ b/setup.py
@@ -51,7 +51,7 @@ if use_cython:
         warn('Failed to generate extension module code from Cython files')
 
 setup(name='pybasicbayes',
-      version='0.2.2',
+      version='0.2.4',
       description="Basic utilities for Bayesian inference",
       author='Matthew James Johnson',
       author_email='mattjj@csail.mit.edu',
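
For quick reference, a minimal usage sketch of the new RobustRegression class, condensed from examples/robust_regression.py above.  The sizes, outlier fraction, and number of Gibbs sweeps are illustrative only.

import numpy as np
import numpy.random as npr
from pybasicbayes.distributions import RobustRegression

D_in, D_out, N = 2, 1, 100
X = npr.randn(N, D_in)
y = X.dot(npr.randn(D_in, D_out)) + 0.1 * npr.randn(N, D_out)
outliers = npr.rand(N) < 0.1
y[outliers] = 3 * npr.randn(outliers.sum(), D_out)   # corrupt a few points

robust_reg = RobustRegression(nu_0=D_out + 2,
                              S_0=np.eye(D_out),
                              M_0=np.zeros((D_out, D_in + 1)),
                              K_0=np.eye(D_in + 1),
                              affine=True)

for _ in range(100):                  # Gibbs sweeps
    robust_reg.resample([(X, y)])     # resamples tau, then (A, Sigma), then nu

print(robust_reg.A)    # weights and bias (last column), shape (D_out, D_in + 1)
print(robust_reg.nu)   # inferred degrees of freedom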
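
A small sanity check of the scale-mixture identity in the RobustRegression docstring: drawing tau ~ Gamma(nu/2, nu/2) and then y | tau ~ N(mu, sigma^2 / tau) should match a Student-t with nu degrees of freedom, location mu, and scale sigma.  This sketch assumes only numpy and scipy.stats and uses the same Gamma parameterization as RobustRegression.rvs; the constants are arbitrary.

import numpy as np
from scipy.stats import t as student_t

nu, mu, sigma = 4.0, 0.5, 0.3
n = 200000
tau = np.random.gamma(nu / 2.0, 2.0 / nu, size=n)     # shape, scale -> Gamma(nu/2, rate=nu/2)
y = mu + sigma * np.random.randn(n) / np.sqrt(tau)    # conditionally Gaussian draws

# Empirical quantiles should closely match the analytic t distribution
qs = [0.05, 0.25, 0.5, 0.75, 0.95]
print(np.percentile(y, [100 * q for q in qs]))
print(student_t(df=nu, loc=mu, scale=sigma).ppf(qs))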