Compare revisions — agpd_public/sab_pybasicbayes (revisions: master, working).
Changes are shown as if the source revision was being merged into the target revision.
Commits on Source (2)
@@ -9,7 +9,7 @@ __all__ = ['Dynamic_GLM']
def local_multivariate_normal_draw(x, sigma, normal):
"""
Function to combine pre-drawn Normals (normal) with the desired mean x and variance sigma (because calls to multivariate normal are too costly)
Cholesky doesn't like a 0 covariance matrix, but we want to allow it.
This might need changing if in practice we see plenty of 0 matrices
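In effect this presumably amounts to x + np.linalg.cholesky(sigma) @ normal, with an all-zero sigma handled as a special case.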
@@ -23,7 +23,7 @@ def local_multivariate_normal_draw(x, sigma, normal):
print("Weird covariance matrix")
quit()
# fix seed of PyPolyaGamma, otherwise results aren't reproducible
ppgsseed = 4
if ppgsseed == 4:
print("Using default seed")
@@ -52,7 +52,7 @@ class Dynamic_GLM(GibbsSampling):
self.T = T
self.jumplimit = jumplimit
self.x_0 = prior_mean
self.P_0, self.Q = P_0, Q # initial variance of regressors P_0, and variance for timesteps Q (can have different variances, we have it fixed)
self.psi_diff_saves = [] # this can be used to resample the variance, but is currently unused
self.noise_mean = np.zeros(self.n_regressors) # save this, so as to not keep creating it
self.identity = np.eye(self.n_regressors) # not really needed, but kinda useful for state sampling
@@ -81,8 +81,11 @@ class Dynamic_GLM(GibbsSampling):
return outputs
def log_likelihood(self, input, timepoint):
"""
Given input from a single session (a matrix containing features and responses) and the timepoint information from that session, return log-likelihoods for observations.
"""
predictors, responses = input[:, :-1], input[:, -1]
nans = np.isnan(responses) # can come from cross-validation
probs = np.zeros((input.shape[0], 2))
out = np.zeros(input.shape[0])
# I could possibly save the 1 / ..., since it's logged it's just - log (but the other half of the probs is an issue)
@@ -97,17 +100,26 @@ class Dynamic_GLM(GibbsSampling):
# Gibbs sampling
def resample(self, data=[]):
"""
Resampling of the dynamic logistic random variables.
We follow the resampling scheme of Windle: "Efficient Data Augmentation in Dynamic Models for Binary and Count Data".
This makes use of the forward-filter backward-sample algorithm, which relies on Kalman filtering; for the filtering we follow Anderson & Moore 1979.
Usually you get just one type of observation per timestep (one set of features), but we have a number of different feature sets per timestep. Thus,
we invent additional timesteps, but we don't allow the regressors to change across these invented steps, effectively making the observations occur at the
same time.
"""
# TODO: Clean up this mess, I always have to call delete_obs_data because of all the saved state!
self.psi_diff_saves = []
summary_statistics, all_times = self._get_statistics(data)
types, pseudo_counts, counts = summary_statistics # pseudo_counts are kappa_t
# if state is never active, resample from prior, but without dynamic change
if len(counts) == 0:
self.weights = np.tile(np.random.multivariate_normal(mean=self.x_0, cov=self.P_0), (self.T, 1))
return
"""compute Kalman filter parameters, also sort out which weight vector goes to which timepoint"""
# Sort out which weight vector goes to which timepoint
timepoint_map = {}
total_types = 0
actual_obs_count = 0
@@ -128,31 +140,42 @@ class Dynamic_GLM(GibbsSampling):
timepoint_map[t] = total_types - 1
prev_t = t
# We introduce pseudo-variance steps: we don't want change for the fake times, so only specific pseudo-Q's will be non-zero
self.pseudo_Q = np.zeros((total_types, self.n_regressors, self.n_regressors))
# TODO: is it okay to cut off last timepoint here?
for k in range(self.T):
if k in timepoint_map:
self.pseudo_Q[timepoint_map[k]] = self.Q[k] # for every real timepoint, map its variance onto the pseudo_Q
"""sample pseudo obs"""
temp = np.empty(actual_obs_count)
"""Prepare for pseudo-observation sampling, collect all the psi's"""
psis = np.empty(actual_obs_count)
psi_counter = 0
predictors = []
for type, time in zip(types, all_times):
for t in type:
# psi (for a specific observation type t) is the weight vector at this time multiplied with the predictors t, i.e. the log odds
psis[psi_counter] = np.sum(self.weights[time] * t)
predictors.append(t)
psi_counter += 1
# Here we use the pg package to draw the augmenting variables from a Polya-Gamma distribution
# Windle calls the counts b_t = a_t + d_t, just the total number of times a psi has been seen. omega is the array in which our draws will be saved
omega = np.empty(actual_obs_count)
ppgs.pgdrawv(np.concatenate(counts).astype(float), psis, omega)
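# pgdrawv draws omega_i ~ PG(b_i, psi_i) element-wise and writes the results into the preallocated omega array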
# R is the variance on the observations in Kalman filtering. According to Windle, that is 1 / omega here
self.R = np.zeros(total_types)
mask = np.ones(total_types, dtype=bool) # plain bool: np.bool is deprecated / removed in recent NumPy
mask[fake_times] = False # during fake times, we make no observations, so no variance
self.R[mask] = 1 / omega
# Create pseudo-obs. They can be treated as if they came from N(psi, 1 / omega)
self.pseudo_obs = np.zeros(total_types)
self.pseudo_obs[mask] = np.concatenate(pseudo_counts) / omega # Windle: pseudo-data points are z_t = kappa_t / omega_t
self.pseudo_obs = self.pseudo_obs.reshape(total_types, 1)
# H is the matrix that transforms the latents into the (means of the) observations
# in our case, these are the features, they multiply with the regressors to give us the means that we noisily observe
self.H = np.zeros((total_types, self.n_regressors, 1))
self.H[mask] = np.array(predictors).reshape(actual_obs_count, self.n_regressors, 1)
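# each real pseudo-observation is thus modelled as z_t = H_t^T x_t + e_t, e_t ~ N(0, R_t), with x_t the regressor weights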
@@ -161,38 +184,48 @@ class Dynamic_GLM(GibbsSampling):
self.compute_sigmas(total_types)
self.compute_means(total_types)
"""sample states"""
"""Sample states
We follow Apendix 1 from "On Gibbs Sampling for State Space Models" by Carter & Kohn"""
self.weights.fill(0)
pseudo_weights = np.empty((total_types, self.n_regressors))
# the first weight can be sampled directly, from the end of the Kalman inference process
pseudo_weights[total_types - 1] = np.random.multivariate_normal(self.x_hat_k[total_types - 1], self.sigma_k[total_types - 1])
# the other weights now depend on the weight sampled just after them, we need to update the Kalman estimates
normals = np.random.standard_normal((total_types - 1, self.n_regressors)) # pregenerate some normal rvs to use later
for k in range(total_types - 2, -1, -1): # would normally start at total_types - 1, but we already sampled that weight above
if np.all(self.pseudo_Q[k] == 0): # if this is a fake time, no change is allowed, just copy the weight
pseudo_weights[k] = pseudo_weights[k + 1]
else:
# start with the Kalman filter inferences
updated_x = self.x_hat_k[k].copy() # not sure whether copy is necessary here
updated_sigma = self.sigma_k[k].copy()
# then update them with the subsequent weights
for m in range(self.n_regressors):
# these are the Carter & Kohn equations, making use of the fact that F is identity, therefore x^tilde is x
# and delta_i is the variance Q
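# concretely, with e_m the m-th unit vector (this is what the three lines below implement):
#   updated_x     <- updated_x + updated_sigma[:, m] * (pseudo_weights[k + 1, m] - updated_x[m]) / (updated_sigma[m, m] + pseudo_Q[k, m, m])
#   updated_sigma <- updated_sigma - updated_sigma e_m e_m^T updated_sigma / (updated_sigma[m, m] + pseudo_Q[k, m, m])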
epsilon = pseudo_weights[k + 1, m] - updated_x[m]
state_R = updated_sigma[m, m] + self.pseudo_Q[k, m, m]
updated_x += updated_sigma[:, m] * epsilon / state_R
updated_sigma -= updated_sigma.dot(np.outer(self.identity[m], self.identity[m])).dot(updated_sigma) / state_R
pseudo_weights[k] = local_multivariate_normal_draw(updated_x, updated_sigma, normals[k]) # transform our normal according to computed mean and var
# put the pseudo_weights where they belong
for k in range(self.T):
if k in timepoint_map:
self.weights[k] = pseudo_weights[timepoint_map[k]]
"""Sample before and after active times too"""
# Sample before and after active times too
# fill weights up before the first determined weight: add noise until the jumplimit is reached, then just copy (in principle this would need to be shrunk by P_0)
for k in range(all_times[0] - 1, -1, -1):
if k > all_times[0] - self.jumplimit - 1:
self.weights[k] = self.weights[k + 1] + np.random.multivariate_normal(self.noise_mean, self.Q[k])
else:
self.weights[k] = self.weights[k + 1]
# fill weights up after the last determined weight: add noise until the jumplimit is reached, then just copy
for k in range(all_times[-1] + 1, self.T):
if k < min(all_times[-1] + 1 + self.jumplimit, self.T):
self.weights[k] = self.weights[k - 1] + np.random.multivariate_normal(self.noise_mean, self.Q[k])
@@ -204,7 +237,11 @@ class Dynamic_GLM(GibbsSampling):
# self.psi_diff_saves = np.concatenate(self.psi_diff_saves)
def _get_statistics(self, data):
"""For the data of a list of sessions, we need to know:
which unique features did the state encounter (types)
what is their corresponding pseudo-observation (pseudo_counts)
how often did they occur (counts)
"""
summary_statistics = [[], [], []]
times = []
if isinstance(data, np.ndarray):
@@ -219,7 +256,8 @@ class Dynamic_GLM(GibbsSampling):
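# (assumption: the types, inverses and counts used here come from np.unique(predictors, axis=0, return_inverse=True, return_counts=True) for each session)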
pseudo_counts = np.zeros(len(types))
for j, c in enumerate(counts):
mask = inverses == j
# find out how the animals answered on these trials, and turn it into a pseudo-observation
pseudo_counts[j] = np.sum(responses[mask]) - c / 2 # already turn this into the desired format: kappa_t = a_t - b_t / 2
summary_statistics[0].append(types)
summary_statistics[1].append(pseudo_counts)
summary_statistics[2].append(counts)
@@ -231,28 +269,33 @@ class Dynamic_GLM(GibbsSampling):
"""Sigmas can be precomputed (without z), we do this here."""
# We rely on the fact that H.T.dot(sigma).dot(H) is just a number, no matrix inversion needed
# furthermore we use the fact that many matrices are identities, namely F and G
# We follow Anderson & Moore 1979, p.44
self.sigma_k = [] # we have to reset this for repeating this calculation later for the resampling (R changes)
self.sigma_k_k_minus = [self.P_0]
self.gain_save = []
for k in range(T):
if self.R[k] == 0: # these are fake times, no observations were made but we allow change as per the jumplimit
self.gain_save.append(None) # need a placeholder
self.sigma_k.append(self.sigma_k_k_minus[k])
self.sigma_k_k_minus.append(self.sigma_k[k] + self.pseudo_Q[k]) # the amount of allowed change just adds up if no obs
else:
sigma, H = self.sigma_k_k_minus[k], self.H[k] # we will need this a lot, so shorten it
gain = sigma.dot(H).dot(1 / (H.T.dot(sigma).dot(H) + self.R[k])) # matrix K, elsewhere called gain
self.gain_save.append(gain) # the gain matrix is useful for compute_means as well, save it
# the two main equations, from p.44:
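# with the gain K_k computed above:
#   Sigma_{k|k}   = Sigma_{k|k-1} - K_k H_k^T Sigma_{k|k-1}
#   Sigma_{k+1|k} = Sigma_{k|k} + pseudo_Q_k   (F and G are identities)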
self.sigma_k.append(sigma - gain.dot(H.T).dot(sigma))
self.sigma_k_k_minus.append(self.sigma_k[k] + self.pseudo_Q[k])
def compute_means(self, T):
"""Compute the means, the estimates of the states.
Used to also contain self.x_hat_k_k_minus, but it's not necessary for our setup
(with F = identity, x_hat_k_k_minus is just equal to x_hat_k, see also p.40)"""
# We follow Anderson & Moore 1979, p.44
self.x_hat_k = [self.x_0] # we have to reset this for repeating this calculation later for the resampling
for k in range(T): # this will leave out last state which doesn't have observation
if self.gain_save[k] is None:
self.x_hat_k.append(self.x_hat_k[k]) # mean stays unchanged if no observations
else:
x, H = self.x_hat_k[k], self.H[k] # we will need this a lot, so shorten it
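# update from p.44 (with F = identity): x_hat_{k+1} = x_hat_k + K_k * (z_k - H_k^T x_hat_k)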
self.x_hat_k.append(x + self.gain_save[k].dot(self.pseudo_obs[k] - H.T.dot(x)))
@@ -260,6 +303,7 @@ class Dynamic_GLM(GibbsSampling):
self.x_hat_k.pop(0) # remove initialisation element from list
def num_parameters(self):
# number of parameters; in principle used to compute WAIC
return self.weights.size
### Max likelihood