diff --git a/pybasicbayes/distributions/dynamic_glm.py b/pybasicbayes/distributions/dynamic_glm.py
index 17b79a5395c4788e75a847bdd96ac60559c0f754..94a11c745f30e10825e2ccb5e77dc31ecef642ab 100644
--- a/pybasicbayes/distributions/dynamic_glm.py
+++ b/pybasicbayes/distributions/dynamic_glm.py
@@ -9,7 +9,7 @@ __all__ = ['Dynamic_GLM']
 
 def local_multivariate_normal_draw(x, sigma, normal):
     """
-    Function to combine pre-drawn Normals (normal) with the desired mean x and variance sigma
+    Function to combine pre-drawn Normals (normal) with the desired mean x and variance sigma (cause calls to multivariate normal are too costly)
 
     Cholesky doesn't like 0 cov matrix, but we want it.
     This might need changing if in practice we see plently of 0 matrices
@@ -23,7 +23,7 @@ def local_multivariate_normal_draw(x, sigma, normal):
         print("Weird covariance matrix")
         quit()
 
-
+# fix seed of PyPolyaGamma, otherwise results aren't reproducible
 ppgsseed = 4
 if ppgsseed == 4:
     print("Using default seed")
@@ -52,7 +52,7 @@ class Dynamic_GLM(GibbsSampling):
         self.T = T
         self.jumplimit = jumplimit
         self.x_0 = prior_mean
-        self.P_0, self.Q = P_0, Q
+        self.P_0, self.Q = P_0, Q  # initial variance of regressors P_0, and variance for timesteps Q (can have different variances, we have it fixed)
         self.psi_diff_saves = []  # this can be used to resample the variance, but is currently unused
         self.noise_mean = np.zeros(self.n_regressors)  # save this, so as to not keep creating it
         self.identity = np.eye(self.n_regressors)  # not really needed, but kinda useful for state sampling
@@ -82,7 +82,7 @@ class Dynamic_GLM(GibbsSampling):
 
     def log_likelihood(self, input, timepoint):
         """
-        Given input from a single session (a matrix containing regressors and responses) and the timepoint information from that session, return log-likelihoods for observations.
+        Given input from a single session (a matrix containing features and responses) and the timepoint information from that session, return log-likelihoods for observations.
         """
         predictors, responses = input[:, :-1], input[:, -1]
         nans = np.isnan(responses)  # can come from cross-validation
@@ -104,18 +104,22 @@ class Dynamic_GLM(GibbsSampling):
         Resampling of dynamic logistic random variables.
         We follow the resampling scheme of Windle: "Efficient Data Augmentation in Dynamic Models for Binary and Count Data".
         This makes use of the forward filter backwards sample algorithm. Which uses Kalman filtering, for which we use Anderson & Moore 1979
+
+        Usually you just get one type of observation per timestep (one set of features), but we have a number of different feature sets per timestep. Thus,
+        we invent more timesteps, but we don't allow the regressors to change from one of these steps to the next, effectively making the observations occur
+        at the same time.
         """
         # TODO: Clean up this mess, I always have to call delete_obs_data because of all the saved shit!
         self.psi_diff_saves = []
         summary_statistics, all_times = self._get_statistics(data)
-        types, pseudo_counts, counts = summary_statistics
+        types, pseudo_counts, counts = summary_statistics  # pseudo_counts are kappa_t
 
         # if state is never active, resample from prior, but without dynamic change
         if len(counts) == 0:
             self.weights = np.tile(np.random.multivariate_normal(mean=self.x_0, cov=self.P_0), (self.T, 1))
             return
 
-        """compute Kalman filter parameters, also sort out which weight vector goes to which timepoint"""
+        # Sort out which weight vector goes to which timepoint
         timepoint_map = {}
         total_types = 0
         actual_obs_count = 0
@@ -136,33 +140,42 @@ class Dynamic_GLM(GibbsSampling):
                 timepoint_map[t] = total_types - 1
                 prev_t = t
 
+        # We introduce pseudo-variance steps: we don't want change for the fake times, so only specific pseudo-Q's will be non-zero
         self.pseudo_Q = np.zeros((total_types, self.n_regressors, self.n_regressors))
         # TODO: is it okay to cut off last timepoint here?
         for k in range(self.T):
             if k in timepoint_map:
-                self.pseudo_Q[timepoint_map[k]] = self.Q[k]  # for every timepoint, map it's variance onto the pseudo_Q
+                self.pseudo_Q[timepoint_map[k]] = self.Q[k]  # for every real timepoint, map its variance onto the pseudo_Q
 
-        """Prepare for pseudo-observation sampling"""
-        temp = np.empty(actual_obs_count)
+        """Prepare for pseudo-observation sampling, collect all the psi's"""
         psis = np.empty(actual_obs_count)
         psi_counter = 0
         predictors = []
         for type, time in zip(types, all_times):
             for t in type:
+                # psi (for a specific observation type t) is weights at time multiplied with the predictors t, i.e. the log odds
                 psis[psi_counter] = np.sum(self.weights[time] * t)
                 predictors.append(t)
                 psi_counter += 1
 
         # Here we use the pg package to draw augmenting variable from a polya-gamma distribution
-        # Windle calls the counts b_t = a_t + d_t, just the total number of times a psi has been seen. temp is the array in which our draws will be saved
-        ppgs.pgdrawv(np.concatenate(counts).astype(float), psis, temp)
+        # Windle calls the counts b_t = a_t + d_t, just the total number of times a psi has been seen. omega is the array in which our draws will be saved
+        omega = np.empty(actual_obs_count)
+        ppgs.pgdrawv(np.concatenate(counts).astype(float), psis, omega)
+
+        # R is the variance on the observations in Kalman filtering. According to Windle, that is 1 / omega here
         self.R = np.zeros(total_types)
         mask = np.ones(total_types, dtype=np.bool)
-        mask[fake_times] = False
-        self.R[mask] = 1 / temp
+        mask[fake_times] = False  # during fake times, we make no observations, so no variance
+        self.R[mask] = 1 / omega
+
+        # Create pseudo-obs. They can be treated as if they came from N(psi, 1 / omega)
         self.pseudo_obs = np.zeros(total_types)
-        self.pseudo_obs[mask] = np.concatenate(pseudo_counts) / temp
+        self.pseudo_obs[mask] = np.concatenate(pseudo_counts) / omega  # Windle: pseudo-data points are z_t = kappa_t / omega_t
         self.pseudo_obs = self.pseudo_obs.reshape(total_types, 1)
+
+        # H is the matrix that transforms the latents into the (means of the) observations
+        # in our case, these are the features, they multiply with the regressors to give us the means that we noisily observe
         self.H = np.zeros((total_types, self.n_regressors, 1))
         self.H[mask] = np.array(predictors).reshape(actual_obs_count, self.n_regressors, 1)
 
@@ -171,38 +184,48 @@ class Dynamic_GLM(GibbsSampling):
 
         self.compute_sigmas(total_types)
         self.compute_means(total_types)
-        """sample states"""
+        """Sample states
+        We follow Appendix 1 from "On Gibbs Sampling for State Space Models" by Carter & Kohn"""
         self.weights.fill(0)
         pseudo_weights = np.empty((total_types, self.n_regressors))
+        # the first weight can be sampled directly, from the end of the Kalman inference process
         pseudo_weights[total_types - 1] = np.random.multivariate_normal(self.x_hat_k[total_types - 1], self.sigma_k[total_types - 1])
 
-        normals = np.random.standard_normal((total_types - 1, self.n_regressors))
+        # the other weights now depend on the weight sampled just after them, we need to update the Kalman estimates
+        normals = np.random.standard_normal((total_types - 1, self.n_regressors))  # pregenerate some normal rvs to use later
         for k in range(total_types - 2, -1, -1):  # normally -1, but we already did first sampling
-            if np.all(self.pseudo_Q[k] == 0):
+            if np.all(self.pseudo_Q[k] == 0):  # if this is a fake time, no change is allowed, just copy the weight
                 pseudo_weights[k] = pseudo_weights[k + 1]
             else:
+                # start with the Kalman filter inferences
                 updated_x = self.x_hat_k[k].copy()  # not sure whether copy is necessary here
                 updated_sigma = self.sigma_k[k].copy()
 
+                # then update them with the subsequent weights
                 for m in range(self.n_regressors):
+                    # these are the Carter & Kohn equations, making use of the fact that F is identity, therefore x^tilde is x
+                    # and delta_i is the variance Q
                     epsilon = pseudo_weights[k + 1, m] - updated_x[m]
                     state_R = updated_sigma[m, m] + self.pseudo_Q[k, m, m]
 
-                    updated_x += updated_sigma[:, m] * epsilon / state_R  # I don't think it's important, but probs we need the first column
+                    updated_x += updated_sigma[:, m] * epsilon / state_R
                     updated_sigma -= updated_sigma.dot(np.outer(self.identity[m], self.identity[m])).dot(updated_sigma) / state_R
 
-                pseudo_weights[k] = local_multivariate_normal_draw(updated_x, updated_sigma, normals[k])
+                pseudo_weights[k] = local_multivariate_normal_draw(updated_x, updated_sigma, normals[k])  # transform our normal according to computed mean and var
 
+        # put the pseudo_weights where they belong
         for k in range(self.T):
             if k in timepoint_map:
                 self.weights[k] = pseudo_weights[timepoint_map[k]]
 
-        """Sample before and after active times too"""
+        # Sample before and after active times too
+        # fill weights up before first determined weight, at least until the jumplimit is reached (in principle this would need to be shrunk by P_0)
         for k in range(all_times[0] - 1, -1, -1):
             if k > all_times[0] - self.jumplimit - 1:
                 self.weights[k] = self.weights[k + 1] + np.random.multivariate_normal(self.noise_mean, self.Q[k])
             else:
                 self.weights[k] = self.weights[k + 1]
 
+        # fill weights up after last determined weights, at least until jumplimit
         for k in range(all_times[-1] + 1, self.T):
             if k < min(all_times[-1] + 1 + self.jumplimit, self.T):
                 self.weights[k] = self.weights[k - 1] + np.random.multivariate_normal(self.noise_mean, self.Q[k])
@@ -214,7 +237,11 @@ class Dynamic_GLM(GibbsSampling):
         # self.psi_diff_saves = np.concatenate(self.psi_diff_saves)
 
     def _get_statistics(self, data):
-        # TODO: improve
+        """For the data of a list of sessions, we need to know:
+        which unique features did the state encounter (types)
+        what is their corresponding pseudo-observation (pseudo_counts)
+        how often did they occur (counts)
+        """
         summary_statistics = [[], [], []]
         times = []
         if isinstance(data, np.ndarray):
@@ -229,7 +256,8 @@ class Dynamic_GLM(GibbsSampling):
             pseudo_counts = np.zeros(len(types))
             for j, c in enumerate(counts):
                 mask = inverses == j
-                pseudo_counts[j] = np.sum(responses[mask]) - c / 2
+                # find out how the animals answered on these trials, and turn it into a pseudo-observation
+                pseudo_counts[j] = np.sum(responses[mask]) - c / 2  # already turn this into the desired format: kappa_t = a_t - b_t / 2
             summary_statistics[0].append(types)
             summary_statistics[1].append(pseudo_counts)
             summary_statistics[2].append(counts)
@@ -241,29 +269,33 @@ class Dynamic_GLM(GibbsSampling):
         """Sigmas can be precomputed (without z), we do this here."""
         # We rely on the fact that H.T.dot(sigma).dot(H) is just a number, no matrix inversion needed
         # furthermore we use the fact that many matrices are identities, namely F and G
-        # We follow Anderson & Moore 1979, equations are listed where applicable
+        # We follow Anderson & Moore 1979, p.44
         self.sigma_k = []  # we have to reset this for repeating this calculation later for the resampling (R changes)
-        self.sigma_k_k_minus = [self.P_0]  # (1.10)
+        self.sigma_k_k_minus = [self.P_0]
         self.gain_save = []
         for k in range(T):
-            if self.R[k] == 0:
-                self.gain_save.append(None)
+            if self.R[k] == 0:  # these are fake times, no observations were made but we allow change as per the jumplimit
+                self.gain_save.append(None)  # need a placeholder
                 self.sigma_k.append(self.sigma_k_k_minus[k])
-                self.sigma_k_k_minus.append(self.sigma_k[k] + self.pseudo_Q[k])
+                self.sigma_k_k_minus.append(self.sigma_k[k] + self.pseudo_Q[k])  # the amount of allowed change just adds up if no obs
             else:
                 sigma, H = self.sigma_k_k_minus[k], self.H[k]  # we will need this a lot, so shorten it
-                gain = sigma.dot(H).dot(1 / (H.T.dot(sigma).dot(H) + self.R[k]))
-                self.gain_save.append(gain)
+                gain = sigma.dot(H).dot(1 / (H.T.dot(sigma).dot(H) + self.R[k]))  # matrix K, elsewhere called gain
+                self.gain_save.append(gain)  # the gain matrix is useful for compute_means as well, save it
+
+                # the two main equations, from p.44:
                 self.sigma_k.append(sigma - gain.dot(H.T).dot(sigma))
                 self.sigma_k_k_minus.append(self.sigma_k[k] + self.pseudo_Q[k])
 
     def compute_means(self, T):
         """Compute the means, the estimates of the states.
-        Used to also contain self.x_hat_k_k_minus, but it's not necessary for our setup"""
+        Used to also contain self.x_hat_k_k_minus, but it's not necessary for our setup
+        (Especially (if not generally), with F=1, x_hat_k_k_minus is just equal to x_hat_k, see also p.40)"""
+        # We follow Anderson & Moore 1979, p.44
         self.x_hat_k = [self.x_0]  # we have to reset this for repeating this calculation later for the resampling
         for k in range(T):  # this will leave out last state which doesn't have observation
             if self.gain_save[k] is None:
-                self.x_hat_k.append(self.x_hat_k[k])
+                self.x_hat_k.append(self.x_hat_k[k])  # mean stays unchanged if no observations
             else:
                 x, H = self.x_hat_k[k], self.H[k]  # we will need this a lot, so shorten it
                 self.x_hat_k.append(x + self.gain_save[k].dot(self.pseudo_obs[k] - H.T.dot(x)))
@@ -271,6 +303,7 @@ class Dynamic_GLM(GibbsSampling):
         self.x_hat_k.pop(0)  # remove initialisation element from list
 
     def num_parameters(self):
+        # Compute WAIC in principle
         return self.weights.size
 
     ### Max likelihood
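
For readers of this patch, the Polya-Gamma step described in the new comments can be exercised in isolation. The snippet below is a minimal sketch, not part of the module: it reuses only calls that the diff itself shows (PyPolyaGamma with a fixed seed, pgdrawv), while the weights, feature vectors, counts and successes are made-up illustration values, not anything taken from Dynamic_GLM.

# Illustrative sketch of the Windle-style augmentation used above (made-up inputs, not the class's data).
import numpy as np
from pypolyagamma import PyPolyaGamma

ppgs = PyPolyaGamma(4)                               # fixed seed, mirroring the module's reproducibility choice
weights = np.array([0.5, -1.0, 0.25])                # current regressor weights at one (pseudo-)timestep
features = np.array([[1.0, 0.0, 1.0],
                     [0.0, 1.0, 1.0]])               # two observed feature vectors ("types")
counts = np.array([3.0, 5.0])                        # b_t = a_t + d_t, number of times each type was seen
successes = np.array([2.0, 1.0])                     # a_t, number of rightward answers per type

psis = features.dot(weights)                         # log odds psi_t for each type
omega = np.empty_like(psis)
ppgs.pgdrawv(counts, psis, omega)                    # omega_t ~ PG(b_t, psi_t)

kappa = successes - counts / 2                       # kappa_t = a_t - b_t / 2
pseudo_obs = kappa / omega                           # z_t = kappa_t / omega_t, treated as N(psi_t, 1 / omega_t)
R = 1 / omega                                        # observation variance handed to the Kalman filter
print(pseudo_obs, R)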
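
Likewise, the Carter & Kohn backward step referenced in the resample comments (with F = I, so the one-step prediction of the mean is just the filtered mean) can be written as a standalone function. This is a sketch only: backward_step and all of its arguments are hypothetical names that mirror the per-regressor loop in Dynamic_GLM.resample, not an API of the class, and the inputs are arbitrary example values.

# Sketch of one backward-sampling step with F = I (hypothetical helper, mirrors the per-regressor loop in resample).
import numpy as np

def backward_step(x_filt, P_filt, Q, next_draw, rng):
    """Condition the filtered estimate N(x_filt, P_filt) at time k on the weights already
    sampled at time k + 1, one regressor at a time, then draw the weights at time k."""
    x, P = x_filt.copy(), P_filt.copy()
    for m in range(len(x)):
        epsilon = next_draw[m] - x[m]                # innovation coming from the future sample
        s = P[m, m] + Q[m, m]                        # scalar innovation variance
        x = x + P[:, m] * epsilon / s                # conditional mean update
        P = P - np.outer(P[:, m], P[m, :]) / s       # conditional covariance update
    return rng.multivariate_normal(x, P)

rng = np.random.default_rng(0)
x_filt = np.zeros(3)                                 # filtered mean at time k
P_filt = 0.5 * np.eye(3)                             # filtered covariance at time k
Q = 0.1 * np.eye(3)                                  # state noise between k and k + 1
draw_k_plus_1 = np.array([0.2, -0.1, 0.4])           # weights already sampled at k + 1
print(backward_step(x_filt, P_filt, Q, draw_k_plus_1, rng))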