diff --git a/pybasicbayes/distributions/dynamic_glm.py b/pybasicbayes/distributions/dynamic_glm.py
index 17b79a5395c4788e75a847bdd96ac60559c0f754..94a11c745f30e10825e2ccb5e77dc31ecef642ab 100644
--- a/pybasicbayes/distributions/dynamic_glm.py
+++ b/pybasicbayes/distributions/dynamic_glm.py
@@ -9,7 +9,7 @@ __all__ = ['Dynamic_GLM']
 
 def local_multivariate_normal_draw(x, sigma, normal):
     """
-    Function to combine pre-drawn Normals (normal) with the desired mean x and variance sigma
+    Function to combine pre-drawn Normals (normal) with the desired mean x and variance sigma (cause calls to multivariate normal are too costly)
 
     Cholesky doesn't like 0 cov matrix, but we want it.
     This might need changing if in practice we see plently of 0 matrices
@@ -23,7 +23,7 @@ def local_multivariate_normal_draw(x, sigma, normal):
         print("Weird covariance matrix")
         quit()
 
-
+# fix seed of PyPolyaGamma, otherwise results aren't reproducible
 ppgsseed = 4
 if ppgsseed == 4:
     print("Using default seed")
@@ -52,7 +52,7 @@ class Dynamic_GLM(GibbsSampling):
         self.T = T
         self.jumplimit = jumplimit
         self.x_0 = prior_mean
-        self.P_0, self.Q = P_0, Q
+        self.P_0, self.Q = P_0, Q  # initial variance of regressors P_0, and variance for timesteps Q (can have different variances, we have it fixed)
         self.psi_diff_saves = []  # this can be used to resample the variance, but is currently unused
         self.noise_mean = np.zeros(self.n_regressors)  # save this, so as to not keep creating it
         self.identity = np.eye(self.n_regressors)  # not really needed, but kinda useful for state sampling
@@ -82,7 +82,7 @@ class Dynamic_GLM(GibbsSampling):
 
     def log_likelihood(self, input, timepoint):
         """
-        Given input from a single session (a matrix containing regressors and responses) and the timepoint information from that session, return log-likelihoods for observations.
+        Given input from a single session (a matrix containing features and responses) and the timepoint information from that session, return log-likelihoods for observations.
         """
         predictors, responses = input[:, :-1], input[:, -1]
         nans = np.isnan(responses)  # can come from cross-validation
@@ -104,18 +104,22 @@ class Dynamic_GLM(GibbsSampling):
         Resampling of dynamic logistic random variables.
         We follow the resampling scheme of Windle: "Efficient Data Augmentation in Dynamic Models for Binary and Count Data".
         This makes use of the forward filter backwards sample algorithm. Which uses Kalman filtering, for which we use Anderson & Moore 1979
+
+        Usually you just get one type of observation per timestep (one set of features), but we have a number of different feature sets per timestep. Thus,
+        we invent more timesteps, but we don't allow the regressors to change from one of these steps to the next, effectively making the observations occur
+        at the same time.
         """
         # TODO: Clean up this mess, I always have to call delete_obs_data because of all the saved shit!
         self.psi_diff_saves = []
         summary_statistics, all_times = self._get_statistics(data)
-        types, pseudo_counts, counts = summary_statistics
+        types, pseudo_counts, counts = summary_statistics  # pseudo_counts are kappa_t
 
         # if state is never active, resample from prior, but without dynamic change
         if len(counts) == 0:
             self.weights = np.tile(np.random.multivariate_normal(mean=self.x_0, cov=self.P_0), (self.T, 1))
             return
 
-        """compute Kalman filter parameters, also sort out which weight vector goes to which timepoint"""
+        # Sort out which weight vector goes to which timepoint
         timepoint_map = {}
         total_types = 0
         actual_obs_count = 0
@@ -136,33 +140,42 @@ class Dynamic_GLM(GibbsSampling):
                 timepoint_map[t] = total_types - 1
                 prev_t = t
 
+        # We introduce pseudo-variance steps: we don't want change for the fake times, so only specific pseudo-Q's will be non-zero
         self.pseudo_Q = np.zeros((total_types, self.n_regressors, self.n_regressors))
         # TODO: is it okay to cut off last timepoint here?
         for k in range(self.T):
             if k in timepoint_map:
-                self.pseudo_Q[timepoint_map[k]] = self.Q[k]  # for every timepoint, map it's variance onto the pseudo_Q
+                self.pseudo_Q[timepoint_map[k]] = self.Q[k]  # for every real timepoint, map its variance onto the pseudo_Q
 
-        """Prepare for pseudo-observation sampling"""
-        temp = np.empty(actual_obs_count)
+        """Prepare for pseudo-observation sampling, collect all the psi's"""
         psis = np.empty(actual_obs_count)
         psi_counter = 0
         predictors = []
         for type, time in zip(types, all_times):
             for t in type:
+                # psi (for a specific observation type t) is weights at time multiplied with the predictors t, i.e. the log odds
                 psis[psi_counter] = np.sum(self.weights[time] * t)
                 predictors.append(t)
                 psi_counter += 1
 
         # Here we use the pg package to draw augmenting variable from a polya-gamma distribution
-        # Windle calls the counts b_t = a_t + d_t, just the total number of times a psi has been seen. temp is the array in which our draws will be saved
-        ppgs.pgdrawv(np.concatenate(counts).astype(float), psis, temp)
+        # Windle calls the counts b_t = a_t + d_t, just the total number of times a psi has been seen. omega is the array in which our draws will be saved
+        omega = np.empty(actual_obs_count)
+        ppgs.pgdrawv(np.concatenate(counts).astype(float), psis, omega)
+
+        # R is the variance on the observations in Kalman filtering. According to Windle, that is 1 / omega here
         self.R = np.zeros(total_types)
         mask = np.ones(total_types, dtype=np.bool)
-        mask[fake_times] = False
-        self.R[mask] = 1 / temp
+        mask[fake_times] = False  # during fake times, we make no observations, so no variance
+        self.R[mask] = 1 / omega
+
+        # Create pseudo-obs. They can be treated as if they came from N(psi, 1 / omega)
         self.pseudo_obs = np.zeros(total_types)
-        self.pseudo_obs[mask] = np.concatenate(pseudo_counts) / temp
+        self.pseudo_obs[mask] = np.concatenate(pseudo_counts) / omega  # Windle: pseudo-data points are z_t = kappa_t / omega_t
         self.pseudo_obs = self.pseudo_obs.reshape(total_types, 1)
+
+        # H is the matrix that transforms the latents into the (means of the) observations
+        # in our case, these are the features, they multiply with the regressors to give us the means that we noisily observe
         self.H = np.zeros((total_types, self.n_regressors, 1))
         self.H[mask] = np.array(predictors).reshape(actual_obs_count, self.n_regressors, 1)
 
@@ -171,38 +184,48 @@ class Dynamic_GLM(GibbsSampling):
 
         self.compute_sigmas(total_types)
         self.compute_means(total_types)
-        """sample states"""
+        """Sample states
+        We follow Appendix 1 from "On Gibbs Sampling for State Space Models" by Carter & Kohn"""
         self.weights.fill(0)
         pseudo_weights = np.empty((total_types, self.n_regressors))
+        # the first weight can be sampled directly, from the end of the Kalman inference process
         pseudo_weights[total_types - 1] = np.random.multivariate_normal(self.x_hat_k[total_types - 1], self.sigma_k[total_types - 1])
 
-        normals = np.random.standard_normal((total_types - 1, self.n_regressors))
+        # the other weights now depend on the weight sampled just after them, we need to update the Kalman estimates
+        normals = np.random.standard_normal((total_types - 1, self.n_regressors))  # pregenerate some normal rvs to use later
         for k in range(total_types - 2, -1, -1):  # normally -1, but we already did first sampling
-            if np.all(self.pseudo_Q[k] == 0):
+            if np.all(self.pseudo_Q[k] == 0):  # if this is a fake time, no change is allowed, just copy the weight
                 pseudo_weights[k] = pseudo_weights[k + 1]
             else:
+                # start with the Kalman filter inferences
                 updated_x = self.x_hat_k[k].copy()  # not sure whether copy is necessary here
                 updated_sigma = self.sigma_k[k].copy()
 
+                # then update them with the subsequent weights
                 for m in range(self.n_regressors):
+                    # these are the Carter & Kohn equations, making use of the fact that F is identity, therefore x^tilde is x
+                    # and delta_i is the variance Q
                     epsilon = pseudo_weights[k + 1, m] - updated_x[m]
                     state_R = updated_sigma[m, m] + self.pseudo_Q[k, m, m]
 
-                    updated_x += updated_sigma[:, m] * epsilon / state_R  # I don't think it's important, but probs we need the first column
+                    updated_x += updated_sigma[:, m] * epsilon / state_R
                     updated_sigma -= updated_sigma.dot(np.outer(self.identity[m], self.identity[m])).dot(updated_sigma) / state_R
 
-                pseudo_weights[k] = local_multivariate_normal_draw(updated_x, updated_sigma, normals[k])
+                pseudo_weights[k] = local_multivariate_normal_draw(updated_x, updated_sigma, normals[k])  # transform our normal according to computed mean and var
 
+        # put the pseudo_weights where they belong
         for k in range(self.T):
             if k in timepoint_map:
                 self.weights[k] = pseudo_weights[timepoint_map[k]]
 
-        """Sample before and after active times too"""
+        # Sample before and after active times too
+        # fill weights up before first determined weight, at least until the jumplimit is reached (in principle this would need to be shrunk by P_0)
         for k in range(all_times[0] - 1, -1, -1):
             if k > all_times[0] - self.jumplimit - 1:
                 self.weights[k] = self.weights[k + 1] + np.random.multivariate_normal(self.noise_mean, self.Q[k])
             else:
                 self.weights[k] = self.weights[k + 1]
 
+        # fill weights up after last determined weights, at least until jumplimit
         for k in range(all_times[-1] + 1, self.T):
             if k < min(all_times[-1] + 1 + self.jumplimit, self.T):
                 self.weights[k] = self.weights[k - 1] + np.random.multivariate_normal(self.noise_mean, self.Q[k])
@@ -214,7 +237,11 @@ class Dynamic_GLM(GibbsSampling):
         # self.psi_diff_saves = np.concatenate(self.psi_diff_saves)
 
     def _get_statistics(self, data):
-        # TODO: improve
+        """For the data of a list of sessions, we need to know:
+        which unique features did the state encounter (types)
+        what is their corresponding pseudo-observation (pseudo_counts)
+        how often did they occur (counts)
+        """
         summary_statistics = [[], [], []]
         times = []
         if isinstance(data, np.ndarray):
@@ -229,7 +256,8 @@ class Dynamic_GLM(GibbsSampling):
             pseudo_counts = np.zeros(len(types))
             for j, c in enumerate(counts):
                 mask = inverses == j
-                pseudo_counts[j] = np.sum(responses[mask]) - c / 2
+                # find out how the animals answered on these trials, and turn it into a pseudo-observation
+                pseudo_counts[j] = np.sum(responses[mask]) - c / 2  # already turn this into the desired format: kappa_t = a_t - b_t / 2
             summary_statistics[0].append(types)
             summary_statistics[1].append(pseudo_counts)
             summary_statistics[2].append(counts)
@@ -241,29 +269,33 @@ class Dynamic_GLM(GibbsSampling):
         """Sigmas can be precomputed (without z), we do this here."""
         # We rely on the fact that H.T.dot(sigma).dot(H) is just a number, no matrix inversion needed
         # furthermore we use the fact that many matrices are identities, namely F and G
-        # We follow Anderson & Moore 1979, equations are listed where applicable
+        # We follow Anderson & Moore 1979, p.44
         self.sigma_k = []  # we have to reset this for repeating this calculation later for the resampling (R changes)
-        self.sigma_k_k_minus = [self.P_0]  # (1.10)
+        self.sigma_k_k_minus = [self.P_0]
         self.gain_save = []
         for k in range(T):
-            if self.R[k] == 0:
-                self.gain_save.append(None)
+            if self.R[k] == 0:  # these are fake times, no observations were made but we allow change as per the jumplimit
+                self.gain_save.append(None)  # need a placeholder
                 self.sigma_k.append(self.sigma_k_k_minus[k])
-                self.sigma_k_k_minus.append(self.sigma_k[k] + self.pseudo_Q[k])
+                self.sigma_k_k_minus.append(self.sigma_k[k] + self.pseudo_Q[k])  # the amount of allowed change just adds up if no obs
             else:
                 sigma, H = self.sigma_k_k_minus[k], self.H[k]  # we will need this a lot, so shorten it
-                gain = sigma.dot(H).dot(1 / (H.T.dot(sigma).dot(H) + self.R[k]))
-                self.gain_save.append(gain)
+                gain = sigma.dot(H).dot(1 / (H.T.dot(sigma).dot(H) + self.R[k]))  # matrix K, elsewhere called gain
+                self.gain_save.append(gain)  # the gain matrix is useful for compute_means as well, save it
+
+                # the two main equations, from p.44:
                 self.sigma_k.append(sigma - gain.dot(H.T).dot(sigma))
                 self.sigma_k_k_minus.append(self.sigma_k[k] + self.pseudo_Q[k])
 
     def compute_means(self, T):
         """Compute the means, the estimates of the states.
-        Used to also contain self.x_hat_k_k_minus, but it's not necessary for our setup"""
+        Used to also contain self.x_hat_k_k_minus, but it's not necessary for our setup
+        (Especially (if not generally), with F=1, x_hat_k_k_minus is just equal to x_hat_k, see also p.40)"""
+        # We follow Anderson & Moore 1979, p.44
         self.x_hat_k = [self.x_0]  # we have to reset this for repeating this calculation later for the resampling
         for k in range(T):  # this will leave out last state which doesn't have observation
             if self.gain_save[k] is None:
-                self.x_hat_k.append(self.x_hat_k[k])
+                self.x_hat_k.append(self.x_hat_k[k])  # mean stays unchanged if no observations
             else:
                 x, H = self.x_hat_k[k], self.H[k]  # we will need this a lot, so shorten it
                 self.x_hat_k.append(x + self.gain_save[k].dot(self.pseudo_obs[k] - H.T.dot(x)))
@@ -271,6 +303,7 @@ class Dynamic_GLM(GibbsSampling):
         self.x_hat_k.pop(0)  # remove initialisation element from list
 
     def num_parameters(self):
+        # Compute WAIC in principle
         return self.weights.size
 
     ### Max likelihood
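
For readers of this patch, the Polya-Gamma step described in the new comments can be exercised in isolation. The snippet below is a minimal sketch, not part of the module: it reuses only calls that the diff itself shows (PyPolyaGamma with a fixed seed, pgdrawv), while the weights, feature vectors, counts and successes are made-up illustration values, not anything taken from Dynamic_GLM.

# Illustrative sketch of the Windle-style augmentation used above (made-up inputs, not the class's data).
import numpy as np
from pypolyagamma import PyPolyaGamma

ppgs = PyPolyaGamma(4)                               # fixed seed, mirroring the module's reproducibility choice
weights = np.array([0.5, -1.0, 0.25])                # current regressor weights at one (pseudo-)timestep
features = np.array([[1.0, 0.0, 1.0],
                     [0.0, 1.0, 1.0]])               # two observed feature vectors ("types")
counts = np.array([3.0, 5.0])                        # b_t = a_t + d_t, number of times each type was seen
successes = np.array([2.0, 1.0])                     # a_t, number of rightward answers per type

psis = features.dot(weights)                         # log odds psi_t for each type
omega = np.empty_like(psis)
ppgs.pgdrawv(counts, psis, omega)                    # omega_t ~ PG(b_t, psi_t)

kappa = successes - counts / 2                       # kappa_t = a_t - b_t / 2
pseudo_obs = kappa / omega                           # z_t = kappa_t / omega_t, treated as N(psi_t, 1 / omega_t)
R = 1 / omega                                        # observation variance handed to the Kalman filter
print(pseudo_obs, R)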
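
Likewise, the Carter & Kohn backward step referenced in the resample comments (with F = I, so the one-step prediction of the mean is just the filtered mean) can be written as a standalone function. This is a sketch only: backward_step and all of its arguments are hypothetical names that mirror the per-regressor loop in Dynamic_GLM.resample, not an API of the class, and the inputs are arbitrary example values.

# Sketch of one backward-sampling step with F = I (hypothetical helper, mirrors the per-regressor loop in resample).
import numpy as np

def backward_step(x_filt, P_filt, Q, next_draw, rng):
    """Condition the filtered estimate N(x_filt, P_filt) at time k on the weights already
    sampled at time k + 1, one regressor at a time, then draw the weights at time k."""
    x, P = x_filt.copy(), P_filt.copy()
    for m in range(len(x)):
        epsilon = next_draw[m] - x[m]                # innovation coming from the future sample
        s = P[m, m] + Q[m, m]                        # scalar innovation variance
        x = x + P[:, m] * epsilon / s                # conditional mean update
        P = P - np.outer(P[:, m], P[m, :]) / s       # conditional covariance update
    return rng.multivariate_normal(x, P)

rng = np.random.default_rng(0)
x_filt = np.zeros(3)                                 # filtered mean at time k
P_filt = 0.5 * np.eye(3)                             # filtered covariance at time k
Q = 0.1 * np.eye(3)                                  # state noise between k and k + 1
draw_k_plus_1 = np.array([0.2, -0.1, 0.4])           # weights already sampled at k + 1
print(backward_step(x_filt, P_filt, Q, draw_k_plus_1, rng))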