diff --git a/examples/robust_regression.py b/examples/robust_regression.py
new file mode 100644
index 0000000000000000000000000000000000000000..08d4d3112264957b58f461d65269fe7ddf7ff4e6
--- /dev/null
+++ b/examples/robust_regression.py
@@ -0,0 +1,132 @@
+# Demo of a robust regression model with multivariate-t distributed noise
+
+import numpy as np
+import numpy.random as npr
+np.random.seed(0)
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+sns.set_style("white")
+
+from pybasicbayes.util.text import progprint_xrange
+from pybasicbayes.distributions import Regression, RobustRegression
+
+D_out = 1
+D_in = 2
+N = 100
+
+# Make a regression model and simulate data
+A = npr.randn(D_out, D_in)
+b = npr.randn(D_out)
+Sigma = 0.1 * np.eye(D_out)
+
+true_reg = Regression(A=np.column_stack((A, b)), sigma=Sigma, affine=True)
+X = npr.randn(N, D_in)
+y = true_reg.rvs(x=X, return_xy=False)
+
+# Corrupt a fraction of the data
+inds = npr.rand(N) < 0.1
+y[inds] = 3 * npr.randn(inds.sum(), D_out)
+
+# Make a test regression and fit it
+std_reg = Regression(nu_0=D_out + 2,
+                     S_0=np.eye(D_out),
+                     M_0=np.zeros((D_out, D_in+1)),
+                     K_0=np.eye(D_in+1),
+                     affine=True)
+
+robust_reg = RobustRegression(nu_0=D_out+2,
+                              S_0=np.eye(D_out),
+                              M_0=np.zeros((D_out, D_in+1)),
+                              K_0=np.eye(D_in+1),
+                              affine=True)
+
+def _collect(r):
+    ll = r.log_likelihood((X, y))[~inds].sum()
+    err = ((y - r.predict(X))**2).sum(1)
+    mse = np.mean(err[~inds])
+    return r.A.copy(), ll, mse
+
+def _update(r):
+    r.resample([(X,y)])
+    return _collect(r)
+
+# Fit the standard regression
+smpls = [_collect(std_reg)]
+for _ in progprint_xrange(100):
+    smpls.append(_update(std_reg))
+smpls = zip(*smpls)
+std_As, std_lls, std_mses = tuple(map(np.array, smpls))
+
+# Fit the robust regression
+smpls = [_collect(robust_reg)]
+for _ in progprint_xrange(100):
+    smpls.append(_update(robust_reg))
+smpls = zip(*smpls)
+robust_As, robust_lls, robust_mses = tuple(map(np.array, smpls))
+
+
+# Plot the inferred regression function
+plt.figure(figsize=(8, 4))
+xlim = (-3, 3)
+ylim = abs(y).max()
+npts = 50
+x1, x2 = np.meshgrid(np.linspace(*xlim, npts), np.linspace(*xlim, npts))
+
+plt.subplot(131)
+mu = true_reg.predict(np.column_stack((x1.ravel(), x2.ravel())))
+plt.imshow(mu.reshape((npts, npts)),
+           cmap="RdBu", vmin=-ylim, vmax=ylim,
+           alpha=0.8,
+           extent=xlim + tuple(reversed(xlim)))
+plt.scatter(X[~inds,0], X[~inds,1], c=y[~inds, 0], cmap="RdBu", vmin=-ylim, vmax=ylim, edgecolors='gray')
+plt.scatter(X[inds,0], X[inds,1], c=y[inds, 0], cmap="RdBu", vmin=-ylim, vmax=ylim, edgecolors='k', linewidths=1)
+plt.xlim(xlim)
+plt.ylim(xlim)
+plt.title("True")
+
+plt.subplot(132)
+mu = std_reg.predict(np.column_stack((x1.ravel(), x2.ravel())))
+plt.imshow(mu.reshape((npts, npts)),
+           cmap="RdBu", vmin=-ylim, vmax=ylim,
+           alpha=0.8,
+           extent=xlim + tuple(reversed(xlim)))
+plt.scatter(X[~inds,0], X[~inds,1], c=y[~inds, 0], cmap="RdBu", vmin=-ylim, vmax=ylim, edgecolors='gray')
+plt.scatter(X[inds,0], X[inds,1], c=y[inds, 0], cmap="RdBu", vmin=-ylim, vmax=ylim, edgecolors='k', linewidths=1)
+plt.xlim(xlim)
+plt.ylim(xlim)
+plt.title("Standard Regression")
+
+plt.subplot(133)
+mu = robust_reg.predict(np.column_stack((x1.ravel(), x2.ravel())))
+plt.imshow(mu.reshape((npts, npts)),
+           cmap="RdBu", vmin=-ylim, vmax=ylim,
+           alpha=0.8,
+           extent=xlim + tuple(reversed(xlim)))
+plt.scatter(X[~inds,0], X[~inds,1], c=y[~inds, 0], cmap="RdBu", vmin=-ylim, vmax=ylim, edgecolors='gray')
+plt.scatter(X[inds,0], X[inds,1], c=y[inds, 0], cmap="RdBu", vmin=-ylim, vmax=ylim, edgecolors='k', linewidths=1)
+plt.xlim(xlim)
+plt.ylim(xlim)
+plt.title("Robust Regression")
+
+
+print("True A: {}".format(true_reg.A))
+print("Std A: {}".format(std_As.mean(0)))
+print("Robust A: {}".format(robust_As.mean(0)))
+
+# Plot the log likelihoods and mean squared errors
+plt.figure(figsize=(8, 4))
+plt.subplot(121)
+plt.plot(std_lls)
+plt.plot(robust_lls)
+plt.xlabel("Iteration")
+plt.ylabel("Log Likelihood")
+
+plt.subplot(122)
+plt.plot(std_mses, label="Standard")
+plt.plot(robust_mses, label="Robust")
+plt.legend(loc="upper right")
+plt.xlabel("Iteration")
+plt.ylabel("Mean Squared Error")
+
+plt.show()
diff --git a/pybasicbayes/distributions/regression.py b/pybasicbayes/distributions/regression.py
index 5ae35abadf178c13227fdd7187b9e1c09100b46a..46e615783bfb73e49a90bd25b72110ca6fcbcbf0 100644
--- a/pybasicbayes/distributions/regression.py
+++ b/pybasicbayes/distributions/regression.py
@@ -2,11 +2,17 @@ from __future__ import division
 from builtins import zip
 from builtins import range
 __all__ = ['Regression', 'RegressionNonconj', 'ARDRegression',
-           'AutoRegression', 'ARDAutoRegression', 'DiagonalRegression']
+           'AutoRegression', 'ARDAutoRegression', 'DiagonalRegression',
+           'RobustRegression', 'RobustAutoRegression']
+
+from warnings import warn
 
 import numpy as np
 from numpy import newaxis as na
+from scipy.linalg import solve_triangular
+from scipy.special import gammaln, digamma, polygamma
+
 from pybasicbayes.abstractions import GibbsSampling, MaxLikelihood, \
     MeanField, MeanFieldSVI
 from pybasicbayes.util.stats import sample_gaussian, sample_mniw, \
@@ -86,12 +92,15 @@ class Regression(GibbsSampling, MeanField, MaxLikelihood):
 
     @staticmethod
     def _natural_to_standard(natparam):
-        A,B,C,d = natparam
+        A,B,C,d = natparam  # natparam is roughly (yyT, yxT, xxT, n)
         nu = d
        Kinv = C
         K = inv_psd(Kinv)
-        M = B.dot(K)
-        S = A - B.dot(K).dot(B.T)
+        # M = B.dot(K)
+        M = np.linalg.solve(Kinv, B.T).T
+        # This subtraction seems unstable!
+        # It does not necessarily return a PSD matrix
+        S = A - M.dot(B.T)
 
         # numerical padding here...
         K += 1e-8*np.eye(K.shape[0])
@@ -99,6 +108,7 @@ class Regression(GibbsSampling, MeanField, MaxLikelihood):
         assert np.all(0 < np.linalg.eigvalsh(S))
         assert np.all(0 < np.linalg.eigvalsh(K))
 
+        # standard is degrees of freedom, mean of sigma (ish), mean of A, cov of rows of A
         return nu, S, M, K
 
     ### getting statistics
@@ -898,6 +908,248 @@ class DiagonalRegression(Regression, MeanFieldSVI):
             self._meanfieldupdate_sigma(stats, prob=prob, stepsize=stepsize)
 
 
+class RobustRegression(Regression):
+    """
+    Regression with multivariate-t distributed noise.
+
+        y | x ~ t(Ax + b, \Sigma, \nu)
+
+    where \nu >= 1 is the degrees of freedom.
+
+    This is equivalent to the model,
+
+        \tau ~ Gamma(\nu/2, \nu/2)
+        y | x, \tau ~ N(Ax + b, \Sigma / \tau)
+
+    To perform inference in this model, we will introduce
+    auxiliary variables tau (precisions).  With these, we
+    can compute sufficient statistics scaled by \tau and
+    use the standard regression object to
+    update A, b, Sigma | x, y, \tau.
+
+    The degrees of freedom parameter \nu is updated via maximum
+    likelihood using a generalized Newton's method proposed by
+    Tom Minka.  We are not using any prior on \nu, but we
+    could experiment with updating \nu under an
+    uninformative prior, e.g. p(\nu) \propto \nu^{-2},
+    which is equivalent to a flat prior on \nu^{-1}.
+ """ + def __init__( + self, nu_0=None,S_0=None, M_0=None, K_0=None, affine=False, + A=None, sigma=None, nu=None): + + # Default to a somewhat intermediate value of nu + self.nu = nu if nu is not None else 4 + + super(RobustRegression, self).__init__( + nu_0=nu_0, S_0=S_0, M_0=M_0, K_0=K_0, affine=affine, A=A, sigma=sigma) + + def log_likelihood(self,xy): + assert isinstance(xy, (tuple, np.ndarray)) + sigma, D, nu = self.sigma, self.D_out, self.nu + x, y = (xy[:,:-D], xy[:,-D:]) if isinstance(xy,np.ndarray) else xy + + sigma_inv, L = inv_psd(sigma, return_chol=True) + r = y - self.predict(x) + z = sigma_inv.dot(r.T).T + + out = -0.5 * (nu + D) * np.log(1.0 + (r * z).sum(1) / nu) + out += gammaln((nu + D) / 2.0) - gammaln(nu / 2.0) - D / 2.0 * np.log(nu) \ + - D / 2.0 * np.log(np.pi) - np.log(np.diag(L)).sum() + + return out + + def rvs(self,x=None,size=1,return_xy=True): + A, sigma, nu, D = self.A, self.sigma, self.nu, self.D_out + + if self.affine: + A, b = A[:,:-1], A[:,-1] + + x = np.random.normal(size=(size, A.shape[1])) if x is None else x + N = x.shape[0] + mu = self.predict(x) + + # Sample precisions and t-distributed residuals + tau = np.random.gamma(nu / 2.0, 2.0 / nu, size=(N,)) + resid = np.random.randn(N, D).dot(np.linalg.cholesky(sigma).T) + resid /= np.sqrt(tau[:, None]) + + y = mu + resid + return np.hstack((x,y)) if return_xy else y + + def _get_statistics(self, data): + raise Exception("RobustRegression needs scaled statistics.") + + def _get_scaled_statistics(self, data, precisions): + assert isinstance(data, (list, tuple, np.ndarray)) + if isinstance(data,list): + return sum((self._get_scaled_statistics(d, p) for d, p in zip(data, precisions)), + self._empty_statistics()) + + elif isinstance(data, tuple): + x, y = data + bad = np.isnan(x).any(1) | np.isnan(y).any(1) + x, y = x[~bad], y[~bad] + precisions = precisions[~bad] + sqrt_prec = np.sqrt(precisions) + n, D = y.shape + + if self.affine: + x = np.column_stack((x, np.ones(n))) + + # Scale by the precision + # xs = x * sqrt_prec[:, na] + # ys = y * sqrt_prec[:, na] + xs = x * np.tile(sqrt_prec[:, None], (1, x.shape[1])) + ys = y * np.tile(sqrt_prec[:, None], (1, D)) + + xxT, yxT, yyT = xs.T.dot(xs), ys.T.dot(xs), ys.T.dot(ys) + return np.array([yyT, yxT, xxT, n]) + + else: + # data passed in like np.hstack((x, y)) + # x, y = data[:,:-self.D_out], data[:,-self.D_out:] + # return self._get_scaled_statistics((x, y), precisions) + bad = np.isnan(data).any(1) + data = data[~bad] + precisions = precisions[~bad] + n, D = data.shape[0], self.D_out + + # This tile call is suboptimal but without it we can hit issues + # with strided data, as in autoregressive models. + scaled_data = data * np.tile(precisions[:,None], (1, data.shape[1])) + statmat = scaled_data.T.dot(data) + + xxT, yxT, yyT = \ + statmat[:-D,:-D], statmat[-D:,:-D], statmat[-D:,-D:] + + if self.affine: + xy = scaled_data.sum(0) + x, y = xy[:-D], xy[-D:] + xxT = blockarray([[xxT, x[:,na]], + [x[na,:], np.atleast_2d(precisions.sum())]]) + yxT = np.hstack((yxT, y[:,na])) + + return np.array([yyT, yxT, xxT, n]) + + def resample(self, data=[], stats=None): + assert stats is None, \ + "We only support RobustRegression.resample() with data, not stats." 
+
+        # First sample auxiliary variables for each data point
+        tau = self._resample_precision(data)
+
+        # Compute statistics, scaling by tau, and resample as in standard Regression
+        stats = self._get_scaled_statistics(data, tau)
+        super(RobustRegression, self).resample(stats=stats)
+
+        # Resample degrees of freedom \nu
+        self._resample_nu(tau)
+
+    def _resample_precision(self, data):
+        assert isinstance(data, (list, tuple, np.ndarray))
+        if isinstance(data, list):
+            return [self._resample_precision(d) for d in data]
+
+        elif isinstance(data, tuple):
+            x, y = data
+
+        else:
+            x, y = data[:, :-self.D_out], data[:, -self.D_out:]
+
+        assert x.ndim == y.ndim == 2
+        assert x.shape[0] == y.shape[0]
+        assert x.shape[1] == (self.D_in - 1 if self.affine else self.D_in)
+        assert y.shape[1] == self.D_out
+        N = x.shape[0]
+
+        # Weed out the nan's
+        bad = np.any(np.isnan(x), axis=1) | np.any(np.isnan(y), axis=1)
+
+        # Compute posterior params of gamma distribution
+        a_post = self.nu / 2.0 + self.D_out / 2.0
+
+        r = y - self.predict(x)
+        sigma_inv = inv_psd(self.sigma)
+        z = sigma_inv.dot(r.T).T
+        b_post = self.nu / 2.0 + (r * z).sum(1) / 2.0
+
+        assert np.isscalar(a_post) and b_post.shape == (N,)
+        tau = np.zeros(N)
+        tau[~bad] = np.random.gamma(a_post, 1. / b_post[~bad])
+
+        return tau
+
+    def _resample_nu(self, tau, nu0=4, max_iter=100, nu_min=1e-3, nu_max=100, tol=1e-4, verbose=False):
+        """
+        Generalized Newton's method for the degrees of freedom parameter, nu,
+        of a Student's t distribution.  See the notebook in the doc/students_t
+        folder for a complete derivation.
+        """
+        if len(tau) == 0:
+            self.nu = nu0
+            return
+
+        tau = np.concatenate(tau) if isinstance(tau, list) else tau
+        E_tau = np.mean(tau)
+        E_logtau = np.mean(np.log(tau))
+
+        from scipy.special import digamma, polygamma
+        delbo = lambda nu: 1/2 * (1 + np.log(nu/2)) - 1/2 * digamma(nu/2) + 1/2 * E_logtau - 1/2 * E_tau
+        ddelbo = lambda nu: 1/(2 * nu) - 1/4 * polygamma(1, nu/2)
+
+        dnu = np.inf
+        nu = nu0
+        for itr in range(max_iter):
+            if abs(dnu) < tol:
+                break
+
+            if nu < nu_min or nu > nu_max:
+                warn("generalized_newton_studentst_dof fixed point grew beyond "
+                     "bounds [{},{}].".format(nu_min, nu_max))
+                nu = np.clip(nu, nu_min, nu_max)
+                break
+
+            # Perform the generalized Newton update
+            a = -nu**2 * ddelbo(nu)
+            b = delbo(nu) - a / nu
+            assert a > 0 and b < 0, "generalized_newton_studentst_dof encountered invalid values of a,b"
+            dnu = -a / b - nu
+            nu = nu + dnu
+
+        if itr == max_iter - 1:
+            warn("generalized_newton_studentst_dof failed to converge "
+                 "at tolerance {} in {} iterations.".format(tol, itr))
+
+        self.nu = nu
+
+    # Not supporting MLE or mean field for now
+    def max_likelihood(self, data, weights=None, stats=None):
+        raise NotImplementedError
+
+    def meanfieldupdate(self, data=None, weights=None, stats=None):
+        raise NotImplementedError
+
+    def meanfield_sgdstep(self, data, weights, prob, stepsize, stats=None):
+        raise NotImplementedError
+
+    def meanfield_expectedstats(self):
+        raise NotImplementedError
+
+    def expected_log_likelihood(self, xy=None, stats=None):
+        raise NotImplementedError
+
+    def get_vlb(self):
+        raise NotImplementedError
+
+    def resample_from_mf(self):
+        raise NotImplementedError
+
+    def _set_params_from_mf(self):
+        raise NotImplementedError
+
+    def _initialize_mean_field(self):
+        pass
 
 
 class _ARMixin(object):
@@ -913,7 +1165,7 @@ class _ARMixin(object):
         return self.D_out
 
     def predict(self, x):
-        return super(_ARMixin,self).predict(np.atleast_2d(x.ravel()))
+        return super(_ARMixin,self).predict(np.atleast_2d(x))
 
     def rvs(self,lagged_data):
         return super(_ARMixin,self).rvs(
@@ -949,3 +1201,7 @@ class ARDAutoRegression(_ARMixin,ARDRegression):
             + ([1] if M_0.shape[1] % M_0.shape[0] and M_0.shape[0] != 1 else [])
         super(ARDAutoRegression,self).__init__(
             M_0=M_0,blocksizes=blocksizes,**kwargs)
+
+
+class RobustAutoRegression(_ARMixin, RobustRegression):
+    pass
diff --git a/pybasicbayes/models/factor_analysis.py b/pybasicbayes/models/factor_analysis.py
index 03c352bb9b37ca5fc49e2029b380ff33c0697a14..23aa648611b11b7e1a6a0a63c175eff98e01d1eb 100644
--- a/pybasicbayes/models/factor_analysis.py
+++ b/pybasicbayes/models/factor_analysis.py
@@ -58,8 +58,18 @@ class FactorAnalysisStates(object):
 
     def log_likelihood(self):
-        mu = np.dot(self.Z, self.W.T)
-        return -0.5 * np.sum(((self.X - mu) * self.mask) ** 2 / self.sigmasq)
+        # mu = np.dot(self.Z, self.W.T)
+        # return -0.5 * np.sum(((self.X - mu) * self.mask) ** 2 / self.sigmasq)
+
+        # Compute the marginal likelihood, integrating out z
+        mu_x = np.zeros(self.D_obs)
+        Sigma_x = self.W.dot(self.W.T) + np.diag(self.sigmasq)
+
+        if not np.all(self.mask):
+            raise Exception("Need to implement this!")
+        else:
+            from scipy.stats import multivariate_normal
+            return multivariate_normal(mu_x, Sigma_x).logpdf(self.X)
 
     ## Gibbs
     def resample(self):
@@ -184,10 +194,16 @@ class _FactorAnalysisBase(Model):
         data.Z = Z
         if keep:
             self.data_list.append(data)
-        return data
+        return data.X, data.Z
+
+    def _log_likelihoods(self, x, mask=None, **kwargs):
+        self.add_data(x, mask=mask, **kwargs)
+        states = self.data_list.pop()
+        return states.log_likelihood()
 
     def log_likelihood(self):
-        return np.sum([d.log_likelihood() for d in self.data_list])
+        return sum([d.log_likelihood().sum() for d in self.data_list])
+
 
     def log_probability(self):
         lp = 0
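
Note on the noise model: the RobustRegression docstring in this patch relies on the Gamma scale-mixture representation of the multivariate t, which is also how RobustRegression.rvs draws samples. Below is a minimal standalone sketch (not part of the patch; the helper name sample_mvt and the toy parameters are illustrative) that draws samples the same way and checks them against the known t covariance, nu / (nu - 2) * Sigma for nu > 2.

import numpy as np

def sample_mvt(mu, Sigma, nu, size):
    """Draw multivariate-t samples via tau ~ Gamma(nu/2, nu/2), y | tau ~ N(mu, Sigma / tau)."""
    D = len(mu)
    tau = np.random.gamma(nu / 2.0, 2.0 / nu, size=size)                # auxiliary precisions
    eps = np.random.randn(size, D).dot(np.linalg.cholesky(Sigma).T)     # N(0, Sigma) noise
    return mu + eps / np.sqrt(tau[:, None])

if __name__ == "__main__":
    np.random.seed(0)
    nu, Sigma = 5.0, np.diag([1.0, 2.0])
    y = sample_mvt(np.zeros(2), Sigma, nu, 100000)
    # For nu > 2, Cov[y] = nu / (nu - 2) * Sigma; the empirical covariance should be close.
    print(np.cov(y.T))
    print(nu / (nu - 2) * Sigma)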
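Note on the degrees-of-freedom update: the generalized Newton step in _resample_nu can be exercised in isolation. This is a small sketch (again not part of the patch; newton_nu and the simulated precisions are illustrative) that mirrors the delbo/ddelbo update above and recovers a known nu from precisions drawn from Gamma(nu/2, nu/2).

import numpy as np
from scipy.special import digamma, polygamma

def newton_nu(E_tau, E_logtau, nu0=4.0, max_iter=100, tol=1e-4):
    # Gradient and curvature in nu of the expected log density of tau ~ Gamma(nu/2, nu/2)
    delbo = lambda nu: 0.5 * (1 + np.log(nu / 2)) - 0.5 * digamma(nu / 2) \
        + 0.5 * E_logtau - 0.5 * E_tau
    ddelbo = lambda nu: 1.0 / (2 * nu) - 0.25 * polygamma(1, nu / 2)

    nu = nu0
    for _ in range(max_iter):
        # Match the local curvature with a function of the form a/nu + b, then jump to its root
        a = -nu**2 * ddelbo(nu)
        b = delbo(nu) - a / nu
        dnu = -a / b - nu
        nu += dnu
        if abs(dnu) < tol:
            break
    return nu

if __name__ == "__main__":
    np.random.seed(0)
    true_nu = 7.0
    tau = np.random.gamma(true_nu / 2, 2 / true_nu, size=100000)
    print(newton_nu(tau.mean(), np.log(tau).mean()))   # should be close to 7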