diff --git a/examples/robust_regression.py b/examples/robust_regression.py
new file mode 100644
index 0000000000000000000000000000000000000000..08d4d3112264957b58f461d65269fe7ddf7ff4e6
--- /dev/null
+++ b/examples/robust_regression.py
@@ -0,0 +1,132 @@
+# Demo of a robust regression model with multivariate-t distributed noise
+
+import numpy as np
+import numpy.random as npr
+np.random.seed(0)
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+sns.set_style("white")
+
+from pybasicbayes.util.text import progprint_xrange
+from pybasicbayes.distributions import Regression, RobustRegression
+
+D_out = 1
+D_in = 2
+N = 100
+
+# Make a regression model and simulate data
+A = npr.randn(D_out, D_in)
+b = npr.randn(D_out)
+Sigma = 0.1 * np.eye(D_out)
+
+true_reg = Regression(A=np.column_stack((A, b)), sigma=Sigma, affine=True)
+X = npr.randn(N, D_in)
+y = true_reg.rvs(x=X, return_xy=False)
+
+# Corrupt a fraction of the data
+inds = npr.rand(N) < 0.1
+y[inds] = 3 * npr.randn(inds.sum(), D_out)
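+# (inds marks the corrupted points; they are excluded when evaluating fit quality below)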
+
+# Make a test regression and fit it
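+# nu_0, S_0, M_0, K_0 parameterize the matrix-normal inverse-Wishart (MNIW) prior on (A, Sigma)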
+std_reg = Regression(nu_0=D_out + 2,
+                     S_0=np.eye(D_out),
+                     M_0=np.zeros((D_out, D_in+1)),
+                     K_0=np.eye(D_in+1),
+                     affine=True)
+
+robust_reg = RobustRegression(nu_0=D_out + 2,
+                              S_0=np.eye(D_out),
+                              M_0=np.zeros((D_out, D_in+1)),
+                              K_0=np.eye(D_in+1),
+                              affine=True)
+
+def _collect(r):
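+    # Record the regression weights plus the log likelihood and MSE on the uncorrupted points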
+    ll = r.log_likelihood((X, y))[~inds].sum()
+    err = ((y - r.predict(X))**2).sum(1)
+    mse = np.mean(err[~inds])
+    return r.A.copy(), ll, mse
+
+def _update(r):
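+    # One Gibbs sweep over the model parameters given the full (corrupted) dataset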
+    r.resample([(X,y)])
+    return _collect(r)
+
+# Fit the standard regression
+smpls = [_collect(std_reg)]
+for _ in progprint_xrange(100):
+    smpls.append(_update(std_reg))
+smpls = zip(*smpls)
+std_As, std_lls, std_mses = tuple(map(np.array, smpls))
+
+# Fit the robust regression
+smpls = [_collect(robust_reg)]
+for _ in progprint_xrange(100):
+    smpls.append(_update(robust_reg))
+smpls = zip(*smpls)
+robust_As, robust_lls, robust_mses = tuple(map(np.array, smpls))
+
+
+# Plot the inferred regression function
+plt.figure(figsize=(8, 4))
+xlim = (-3, 3)
+ylim = abs(y).max()
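+# Note: ylim is used below as a symmetric color scale limit (vmin/vmax), not as an axis limit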
+npts = 50
+x1, x2 = np.meshgrid(np.linspace(*xlim, npts), np.linspace(*xlim, npts))
+
+plt.subplot(131)
+mu = true_reg.predict(np.column_stack((x1.ravel(), x2.ravel())))
+plt.imshow(mu.reshape((npts, npts)),
+           cmap="RdBu", vmin=-ylim, vmax=ylim,
+           alpha=0.8,
+           extent=xlim + tuple(reversed(xlim)))
+plt.scatter(X[~inds,0], X[~inds,1], c=y[~inds, 0], cmap="RdBu", vmin=-ylim, vmax=ylim, edgecolors='gray')
+plt.scatter(X[inds,0], X[inds,1], c=y[inds, 0], cmap="RdBu", vmin=-ylim, vmax=ylim, edgecolors='k', linewidths=1)
+plt.xlim(xlim)
+plt.ylim(xlim)
+plt.title("True")
+
+plt.subplot(132)
+mu = std_reg.predict(np.column_stack((x1.ravel(), x2.ravel())))
+plt.imshow(mu.reshape((npts, npts)),
+           cmap="RdBu", vmin=-ylim, vmax=ylim,
+           alpha=0.8,
+           extent=xlim + tuple(reversed(xlim)))
+plt.scatter(X[~inds,0], X[~inds,1], c=y[~inds, 0], cmap="RdBu", vmin=-ylim, vmax=ylim, edgecolors='gray')
+plt.scatter(X[inds,0], X[inds,1], c=y[inds, 0], cmap="RdBu", vmin=-ylim, vmax=ylim, edgecolors='k', linewidths=1)
+plt.xlim(xlim)
+plt.ylim(xlim)
+plt.title("Standard Regression")
+
+plt.subplot(133)
+mu = robust_reg.predict(np.column_stack((x1.ravel(), x2.ravel())))
+plt.imshow(mu.reshape((npts, npts)),
+           cmap="RdBu", vmin=-ylim, vmax=ylim,
+           alpha=0.8,
+           extent=xlim + tuple(reversed(xlim)))
+plt.scatter(X[~inds,0], X[~inds,1], c=y[~inds, 0], cmap="RdBu", vmin=-ylim, vmax=ylim, edgecolors='gray')
+plt.scatter(X[inds,0], X[inds,1], c=y[inds, 0], cmap="RdBu", vmin=-ylim, vmax=ylim, edgecolors='k', linewidths=1)
+plt.xlim(xlim)
+plt.ylim(xlim)
+plt.title("Robust Regression")
+
+
+print("True A:   {}".format(true_reg.A))
+print("Std A:    {}".format(std_As.mean(0)))
+print("Robust A: {}".format(robust_As.mean(0)))
+
+# Plot the log likelihoods and mean squared errors
+plt.figure(figsize=(8, 4))
+plt.subplot(121)
+plt.plot(std_lls)
+plt.plot(robust_lls)
+plt.xlabel("Iteration")
+plt.ylabel("Log Likelihood")
+
+plt.subplot(122)
+plt.plot(std_mses, label="Standard")
+plt.plot(robust_mses, label="Robust")
+plt.legend(loc="upper right")
+plt.xlabel("Iteration")
+plt.ylabel("Mean Squared Error")
+
+plt.show()
diff --git a/pybasicbayes/distributions/regression.py b/pybasicbayes/distributions/regression.py
index 5ae35abadf178c13227fdd7187b9e1c09100b46a..46e615783bfb73e49a90bd25b72110ca6fcbcbf0 100644
--- a/pybasicbayes/distributions/regression.py
+++ b/pybasicbayes/distributions/regression.py
@@ -2,11 +2,17 @@ from __future__ import division
 from builtins import zip
 from builtins import range
 __all__ = ['Regression', 'RegressionNonconj', 'ARDRegression',
-           'AutoRegression', 'ARDAutoRegression', 'DiagonalRegression']
+           'AutoRegression', 'ARDAutoRegression', 'DiagonalRegression',
+           'RobustRegression', 'RobustAutoRegression']
+
+from warnings import warn
 
 import numpy as np
 from numpy import newaxis as na
 
+from scipy.linalg import solve_triangular
+from scipy.special import gammaln, digamma, polygamma
+
 from pybasicbayes.abstractions import GibbsSampling, MaxLikelihood, \
     MeanField, MeanFieldSVI
 from pybasicbayes.util.stats import sample_gaussian, sample_mniw, \
@@ -86,12 +92,15 @@ class Regression(GibbsSampling, MeanField, MaxLikelihood):
 
     @staticmethod
     def _natural_to_standard(natparam):
-        A,B,C,d = natparam
+        A,B,C,d = natparam   # natparam is roughly (yyT, yxT, xxT, n)
         nu = d
         Kinv = C
         K = inv_psd(Kinv)
-        M = B.dot(K)
-        S = A - B.dot(K).dot(B.T)
+        # Solve for M rather than forming B.dot(K) explicitly
+        M = np.linalg.solve(Kinv, B.T).T
+        # Note: this subtraction can be numerically unstable and does not
+        # necessarily yield a PSD matrix.
+        S = A - M.dot(B.T)
 
         # numerical padding here...
         K += 1e-8*np.eye(K.shape[0])
@@ -99,6 +108,7 @@ class Regression(GibbsSampling, MeanField, MaxLikelihood):
         assert np.all(0 < np.linalg.eigvalsh(S))
         assert np.all(0 < np.linalg.eigvalsh(K))
 
+        # standard params: nu (degrees of freedom), S (scale matrix for Sigma), M (mean of A), K (covariance of the rows of A)
         return nu, S, M, K
 
     ### getting statistics
@@ -898,6 +908,248 @@ class DiagonalRegression(Regression, MeanFieldSVI):
         self._meanfieldupdate_sigma(stats, prob=prob, stepsize=stepsize)
 
 
+class RobustRegression(Regression):
+    """
+    Regression with multivariate-t distributed noise.
+
+        y | x ~ t(Ax + b, \Sigma, \nu)
+
+    where \nu >= 1 is the degrees of freedom.
+
+    This is equivalent to the model,
+
+        \tau ~ Gamma(\nu/2, \nu/2)
+        y | x, \tau ~ N(Ax + b, \Sigma / \tau)
+
+    To perform inference in this model, we will introduce
+    auxiliary variables tau (precisions).  With these, we
+    can compute sufficient statistics scaled by \tau and
+    use the standard regression object to
+    update A, b, Sigma | x, y, \tau.
+
+    The degrees of freedom parameter \nu is updated via maximum
+    likelihood using the generalized Newton method proposed by
+    Tom Minka.  We do not place a prior on \nu, but one could
+    experiment with updating \nu under an uninformative prior,
+    e.g. p(\nu) \propto \nu^{-2}, which is equivalent to a flat
+    prior on 1/\nu.
+    """
+    def __init__(
+            self, nu_0=None,S_0=None, M_0=None, K_0=None, affine=False,
+            A=None, sigma=None, nu=None):
+
+        # Default to a somewhat intermediate value of nu
+        self.nu = nu if nu is not None else 4
+
+        super(RobustRegression, self).__init__(
+            nu_0=nu_0, S_0=S_0, M_0=M_0, K_0=K_0, affine=affine, A=A, sigma=sigma)
+
+    def log_likelihood(self,xy):
+        assert isinstance(xy, (tuple, np.ndarray))
+        sigma, D, nu = self.sigma, self.D_out, self.nu
+        x, y = (xy[:,:-D], xy[:,-D:]) if isinstance(xy,np.ndarray) else xy
+
+        sigma_inv, L = inv_psd(sigma, return_chol=True)
+        r = y - self.predict(x)
+        z = sigma_inv.dot(r.T).T
+
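+        # Multivariate-t log density:
+        #   log p(y | x) = log Gamma((nu+D)/2) - log Gamma(nu/2) - (D/2) log(nu pi)
+        #                  - (1/2) log|Sigma| - ((nu+D)/2) log(1 + r^T Sigma^{-1} r / nu)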
+        out = -0.5 * (nu + D) * np.log(1.0 + (r * z).sum(1) / nu)
+        out += gammaln((nu + D) / 2.0) - gammaln(nu / 2.0) - D / 2.0 * np.log(nu) \
+            - D / 2.0 * np.log(np.pi) - np.log(np.diag(L)).sum()
+
+        return out
+
+    def rvs(self,x=None,size=1,return_xy=True):
+        A, sigma, nu, D = self.A, self.sigma, self.nu, self.D_out
+
+        if self.affine:
+            A, b = A[:,:-1], A[:,-1]
+
+        x = np.random.normal(size=(size, A.shape[1])) if x is None else x
+        N = x.shape[0]
+        mu = self.predict(x)
+
+        # Sample precisions and t-distributed residuals
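+        # Scale mixture of Gaussians: y = mu + eps / sqrt(tau) with eps ~ N(0, Sigma), tau ~ Gamma(nu/2, nu/2)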
+        tau = np.random.gamma(nu / 2.0, 2.0 / nu, size=(N,))
+        resid = np.random.randn(N, D).dot(np.linalg.cholesky(sigma).T)
+        resid /= np.sqrt(tau[:, None])
+
+        y = mu + resid
+        return np.hstack((x,y)) if return_xy else y
+
+    def _get_statistics(self, data):
+        raise NotImplementedError("RobustRegression requires scaled statistics; use _get_scaled_statistics instead.")
+
+    def _get_scaled_statistics(self, data, precisions):
+        assert isinstance(data, (list, tuple, np.ndarray))
+        if isinstance(data,list):
+            return sum((self._get_scaled_statistics(d, p) for d, p in zip(data, precisions)),
+                       self._empty_statistics())
+
+        elif isinstance(data, tuple):
+            x, y = data
+            bad = np.isnan(x).any(1) | np.isnan(y).any(1)
+            x, y = x[~bad], y[~bad]
+            precisions = precisions[~bad]
+            sqrt_prec = np.sqrt(precisions)
+            n, D = y.shape
+
+            if self.affine:
+                x = np.column_stack((x, np.ones(n)))
+
+            # Scale by the precision
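+            # so that xxT = sum_n tau_n x_n x_n^T, yxT = sum_n tau_n y_n x_n^T, etc.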
+            # As in the ndarray branch below, use np.tile rather than
+            # broadcasting to avoid issues with strided data.
+            xs = x * np.tile(sqrt_prec[:, None], (1, x.shape[1]))
+            ys = y * np.tile(sqrt_prec[:, None], (1, D))
+
+            xxT, yxT, yyT = xs.T.dot(xs), ys.T.dot(xs), ys.T.dot(ys)
+            return np.array([yyT, yxT, xxT, n])
+
+        else:
+            # Here the data is an array laid out as np.hstack((x, y))
+            bad = np.isnan(data).any(1)
+            data = data[~bad]
+            precisions = precisions[~bad]
+            n, D = data.shape[0], self.D_out
+
+            # This tile call is suboptimal but without it we can hit issues
+            # with strided data, as in autoregressive models.
+            scaled_data = data * np.tile(precisions[:,None], (1, data.shape[1]))
+            statmat = scaled_data.T.dot(data)
+
+            xxT, yxT, yyT = \
+                statmat[:-D,:-D], statmat[-D:,:-D], statmat[-D:,-D:]
+
+            if self.affine:
+                xy = scaled_data.sum(0)
+                x, y = xy[:-D], xy[-D:]
+                xxT = blockarray([[xxT,     x[:,na]],
+                                  [x[na,:], np.atleast_2d(precisions.sum())]])
+                yxT = np.hstack((yxT, y[:,na]))
+
+            return np.array([yyT, yxT, xxT, n])
+
+    def resample(self, data=[], stats=None):
+        assert stats is None, \
+            "We only support RobustRegression.resample() with data, not stats."
+
+        # First sample auxiliary variables for each data point
+        tau = self._resample_precision(data)
+
+        # Compute statistics, scaling by tau, and resample as in standard Regression
+        stats = self._get_scaled_statistics(data, tau)
+        super(RobustRegression, self).resample(stats=stats)
+
+        # Resample degrees of freedom \nu
+        self._resample_nu(tau)
+
+    def _resample_precision(self, data):
+        assert isinstance(data, (list, tuple, np.ndarray))
+        if isinstance(data, list):
+            return [self._resample_precision(d) for d in data]
+
+        elif isinstance(data, tuple):
+            x, y = data
+
+        else:
+            x, y = data[:, :-self.D_out], data[:, -self.D_out:]
+
+        assert x.ndim == y.ndim == 2
+        assert x.shape[0] == y.shape[0]
+        assert x.shape[1] == (self.D_in - 1 if self.affine else self.D_in)
+        assert y.shape[1] == self.D_out
+        N = x.shape[0]
+
+        # Weed out the nan's
+        bad = np.any(np.isnan(x), axis=1) | np.any(np.isnan(y), axis=1)
+
+        # Compute posterior params of gamma distribution
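+        # tau_n | x_n, y_n ~ Gamma((nu + D_out) / 2, (nu + r_n^T Sigma^{-1} r_n) / 2)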
+        a_post = self.nu / 2.0 + self.D_out / 2.0
+
+        r = y - self.predict(x)
+        sigma_inv = inv_psd(self.sigma)
+        z = sigma_inv.dot(r.T).T
+        b_post = self.nu / 2.0 + (r * z).sum(1) / 2.0
+
+        assert np.isscalar(a_post) and b_post.shape == (N,)
+        tau = np.zeros(N)
+        tau[~bad] = np.random.gamma(a_post, 1./b_post[~bad])
+
+        return tau
+
+    def _resample_nu(self, tau, nu0=4, max_iter=100, nu_min=1e-3,
+                     nu_max=100, tol=1e-4, verbose=False):
+        """
+        Generalized Newton's method for the degrees of freedom parameter, nu,
+        of a Student's t distribution.  See the notebook in the doc/students_t
+        folder for a complete derivation.
+        """
+        if len(tau) == 0:
+            self.nu = nu0
+            return
+
+        tau = np.concatenate(tau) if isinstance(tau, list) else tau
+        E_tau = np.mean(tau)
+        E_logtau = np.mean(np.log(tau))
+
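+        # delbo(nu) is the derivative w.r.t. nu of the expected log density of
+        # tau ~ Gamma(nu/2, nu/2), averaged over the sampled precisions;
+        # ddelbo(nu) is its second derivative.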
+        delbo = lambda nu: (1/2 * (1 + np.log(nu/2)) - 1/2 * digamma(nu/2)
+                            + 1/2 * E_logtau - 1/2 * E_tau)
+        ddelbo = lambda nu: 1/(2 * nu) - 1/4 * polygamma(1, nu/2)
+
+        dnu = np.inf
+        nu = nu0
+        for itr in range(max_iter):
+            if abs(dnu) < tol:
+                break
+                
+            if nu < nu_min or nu > nu_max:
+                warn("generalized_newton_studentst_dof fixed point grew beyond "
+                     "bounds [{},{}].".format(nu_min, nu_max))
+                nu = np.clip(nu, nu_min, nu_max)
+                break
+            
+            # Perform the generalized Newton update
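+            # Locally approximate the objective by k + a * log(nu) + b * nu, choosing
+            # a, b so that its gradient a / nu + b matches delbo and ddelbo at the
+            # current nu, then jump to the stationary point nu_new = -a / b.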
+            a = -nu**2 * ddelbo(nu)
+            b = delbo(nu) - a / nu
+            assert a > 0 and b < 0, "generalized_newton_studentst_dof encountered invalid values of a,b"
+            dnu = -a / b - nu
+            nu = nu + dnu
+
+        if itr == max_iter - 1:
+            warn("generalized_newton_studentst_dof failed to converge "
+                 "at tolerance {} in {} iterations.".format(tol, itr))
+
+        self.nu = nu
+
+    # Not supporting MLE or mean field for now
+    def max_likelihood(self,data,weights=None,stats=None):
+        raise NotImplementedError
+
+    def meanfieldupdate(self, data=None, weights=None, stats=None):
+        raise NotImplementedError
+
+    def meanfield_sgdstep(self, data, weights, prob, stepsize, stats=None):
+        raise NotImplementedError
+
+    def meanfield_expectedstats(self):
+        raise NotImplementedError
+
+    def expected_log_likelihood(self, xy=None, stats=None):
+        raise NotImplementedError
+
+    def get_vlb(self):
+        raise NotImplementedError
+
+    def resample_from_mf(self):
+        raise NotImplementedError
+
+    def _set_params_from_mf(self):
+        raise NotImplementedError
+
+    def _initialize_mean_field(self):
+        pass
 
 
 class _ARMixin(object):
@@ -913,7 +1165,7 @@ class _ARMixin(object):
         return self.D_out
 
     def predict(self, x):
-        return super(_ARMixin,self).predict(np.atleast_2d(x.ravel()))
+        return super(_ARMixin,self).predict(np.atleast_2d(x))
 
     def rvs(self,lagged_data):
         return super(_ARMixin,self).rvs(
@@ -949,3 +1201,7 @@ class ARDAutoRegression(_ARMixin,ARDRegression):
                 + ([1] if M_0.shape[1] % M_0.shape[0] and M_0.shape[0] != 1 else [])
         super(ARDAutoRegression,self).__init__(
                 M_0=M_0,blocksizes=blocksizes,**kwargs)
+
+
+class RobustAutoRegression(_ARMixin, RobustRegression):
+    pass
diff --git a/pybasicbayes/models/factor_analysis.py b/pybasicbayes/models/factor_analysis.py
index 03c352bb9b37ca5fc49e2029b380ff33c0697a14..23aa648611b11b7e1a6a0a63c175eff98e01d1eb 100644
--- a/pybasicbayes/models/factor_analysis.py
+++ b/pybasicbayes/models/factor_analysis.py
@@ -58,8 +58,18 @@ class FactorAnalysisStates(object):
 
 
     def log_likelihood(self):
-        mu = np.dot(self.Z, self.W.T)
-        return -0.5 * np.sum(((self.X - mu) * self.mask) ** 2 / self.sigmasq)
+        # mu = np.dot(self.Z, self.W.T)
+        # return -0.5 * np.sum(((self.X - mu) * self.mask) ** 2 / self.sigmasq)
+
+        # Compute the marginal likelihood, integrating out z
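+        # Integrating out z ~ N(0, I) gives x ~ N(0, W W^T + diag(sigmasq))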
+        mu_x = np.zeros(self.D_obs)
+        Sigma_x = self.W.dot(self.W.T) + np.diag(self.sigmasq)
+
+        if not np.all(self.mask):
+            raise NotImplementedError("Marginal likelihood with missing data is not yet implemented.")
+        else:
+            from scipy.stats import multivariate_normal
+            return multivariate_normal(mu_x, Sigma_x).logpdf(self.X)
 
     ## Gibbs
     def resample(self):
@@ -184,10 +194,16 @@ class _FactorAnalysisBase(Model):
         data.Z = Z
         if keep:
             self.data_list.append(data)
-        return data
+        return data.X, data.Z
+
+    def _log_likelihoods(self, x, mask=None, **kwargs):
+        self.add_data(x, mask=mask, **kwargs)
+        states = self.data_list.pop()
+        return states.log_likelihood()
 
     def log_likelihood(self):
-        return np.sum([d.log_likelihood() for d in self.data_list])
+        return sum([d.log_likelihood().sum() for d in self.data_list])
+
 
     def log_probability(self):
         lp = 0