Added Experiment 2 (Figure 4).

0ff76562 · Shashank Singh · 18872fc1 · 0ff76562 · 0ff76562 · 18872fc1
Commit 0ff76562 authored 2 years ago by Shashank Singh
--- a/README.md
+++ b/README.md
@@ -14,6 +14,12 @@ To reproduce Figure 3, run:

 This may take a few hours to run.

+To reproduce Figure 4, run:
+
+    python figure_4.py
+
+This may take a few hours to run.
+
 To reproduce Figure 5, run:

    python figure_5.py

--- a/SIR_simulation.py
+++ b/SIR_simulation.py
-import numpy as np
-from scipy.stats import multinomial
 import matplotlib.pyplot as plt
+import numpy as np
+from scipy.stats import beta, dirichlet
+
+def _sim(T=100, N=1000, S0=None, I0=10, R0=0, D0=0, beta=0.2, gamma=0.1, delta=0.01):
+  """Simulates T steps of a stochastic SIR model.
+    T: (int>=0) Number timesteps to compute forward
+    N: (int>=0) Total population
+    S0: (int>=0) Initial number of susceptible individuals
+    I0: (int>=0) Initial number of infected individuals
+    R0: (int>=0) Initial number of recovered individuals
+    D0: (int>=0) Initial number of dead individuals
+    beta: ([0, 1]-valued) Transmission rate
+    gamma: ([0, 1]-valued) Recovery rate
+    delta: ([0, 1]-valued) Death rate
+  """
+
+  if not S0:
+    S0 = N - I0 - R0 - D0

-def sim(N=1000, ts=np.linspace(0, 150, 150), S0=0, I0=10, R0=0, D0=0, beta=0.2, gamma=0.1, delta=0.01):
  S, I, R, D = S0, I0, R0, D0
  Ss = [S0]
  Is = [I0]
  Rs = [R0]
  Ds = [D0]

-  for t in ts[1:]:
+  for t in range(T):
    
    dSdt = -np.random.binomial(S, beta*I/N)
    S += dSdt
-    R, D, I = multinomial.rvs(I, [gamma, delta, 1 - (gamma + delta)])
+    R, D, I = np.random.multinomial(I, [gamma, delta, 1 - (gamma + delta)])
    I -= dSdt
    Ss.append(S)
    Is.append(I)
@@ -22,19 +37,48 @@ def sim(N=1000, ts=np.linspace(0, 150, 150), S0=0, I0=10, R0=0, D0=0, beta=0.2,

  return np.array(Ss), np.array(Is), np.array(Rs), np.array(Ds)

+class Sampler:
+
+  def __init__(self, a=2, b=8, alpha=np.array([1.0, 0.3, 6.7]), T=100, sim_len=20):
+    self.beta = beta(a, b)
+    self.dirichlet = dirichlet(alpha)
+    self.T = T
+    self.sim_len = sim_len
+    self.d_X = 4*sim_len
+
+  def _get_conditional_samples(self, Zs, n):
+    Xs = np.zeros((n, self.d_X))
+    Ys = np.zeros((n, 1))
+  
+    for i in range(n):
+      Ss, Is, Rs, Ds = _sim(beta=Zs[i, 0], gamma=Zs[i, 1], delta=Zs[i, 2], T=self.T)
+      Xs[i, :] = np.concatenate([Ss[:self.sim_len],
+                                 Is[:self.sim_len],
+                                 Rs[:self.sim_len],
+                                 Ds[:self.sim_len]])
+      Ys[i, :] = Ds[-1]
+    return Xs, Ys, Zs
+
+  def generate_conditional_samples(self, z, n):
+    """Generates n independent samples of (X, Y, Z) conditioned on Z = z."""
+
+    Zs = np.tile(z, [n, 1])
+    return self._get_conditional_samples(Zs, n)
+  
+  def generate_joint_samples(self, n):
+    """Generates n independent samples of (X, Y, Z)."""
+
+    Zs = np.concatenate([self.beta.rvs(size=(n,1)), self.dirichlet.rvs(size=n)], axis=1)
+    return self._get_conditional_samples(Zs, n)
+

 if __name__ == "__main__":
-  # Total population, N.
-  N = 1000
-  # Initial number of infected, recovered, and dead individuals, I0, R0, and D0.
-  I, R, D = 10, 0, 0
-  # Everyone else, S0, is susceptible to infection initially.
-  S = N - I - R - D
-  # Contact rate, beta, and mean recovery rate, gamma, (in 1/days).
-  beta, gamma, delta = 0.2, 0.1, 0.01
-  # A grid of time points (in days)
  ts = np.linspace(0, 150, 150)
-  Ss, Is, Rs, Ds = sim(N, ts, S, I, R, D)
+  b = beta.rvs(a=2, b=5)
+  d = dirichlet.rvs([0.9, 0.1, 9.0])
+  gamma, delta = d[0, 0], d[0, 1]
+
+  Ss, Is, Rs, Ds = _sim(len(ts)-1, beta=b, gamma=gamma, delta=delta)
  
  # Plot the data on three separate curves for S(t), I(t) and R(t)
  fig = plt.figure(facecolor='w')
@@ -45,9 +89,6 @@ if __name__ == "__main__":
  ax.plot(ts, Ds/N, alpha=0.5, lw=2, label='Dead')
  ax.set_xlabel('Time /days')
  ax.set_ylabel('Number (1000s)')
-  ax.set_ylim(0,1.2)
-  ax.yaxis.set_tick_params(length=0)
-  ax.xaxis.set_tick_params(length=0)
  ax.grid(visible=True, which='major', c='w', lw=2, ls='-')
  legend = ax.legend()
  legend.get_frame().set_alpha(0.5)

--- a/experiment3.py
+++ b/experiment3.py
-from SIR_simulation import sim
-
-Ss, Is, Rs, Ds = sim()
--- a/figure_4.py
+++ b/figure_4.py
+import matplotlib.pyplot as plt
+import numpy as np
+from tqdm import tqdm
+
+from knn_regressors import passive_knn_regressor, passive_kernel_regressor, active_knn_regressor
+from SIR_simulation import Sampler
+
+plt.rc('font', size=22)  # Increase matplotlib default font size
+np.random.seed(0)  # Fix random seed for reproducibility
+
+num_replicates = 1**10 # Number of IID replications of experiment
+ns = np.logspace(4, 12, 9, base=2., dtype=int)
+
+sim_len = 20
+sampler = Sampler(sim_len=sim_len)
+
+passive_knn_errors = np.zeros((num_replicates, len(ns)))
+passive_errors = np.zeros((num_replicates, len(ns)))
+active_errors = np.zeros((num_replicates, len(ns)))
+for replicate in tqdm(range(num_replicates)):
+  for n_idx, n in enumerate(ns):
+    # Generate some hypothetical "ground truth" from the simulator
+    Xs_ground_truth, Ys_ground_truth, Zs_ground_truth = sampler.generate_joint_samples(1)
+
+    # Simulated data for passive estimators
+    Xs, Ys, Zs = sampler.generate_joint_samples(n)
+    passive_errors[replicate, n_idx] = passive_kernel_regressor(Xs, Ys, Zs, Xs_ground_truth, method='cv') - Ys_ground_truth
+    
+    active_estimates, Xs_active, Ys_active, Zs_active = active_knn_regressor(sampler, Xs_ground_truth, n, method='cv')
+    active_errors[replicate, n_idx] = active_estimates - Ys_ground_truth
+
+def _mae(errors):
+  return np.abs(errors).mean(axis=0)
+def _ste(errors):
+  return np.abs(errors).std(axis=0)/(num_replicates**(1/2))
+
+plt.figure(figsize=(12, 12))
+plt.errorbar(ns, _mae(passive_errors), _ste(passive_errors), ls='--', c='orange', marker='o', markersize=10, lw=2, label='Passive CV')
+plt.errorbar(ns, _mae(active_errors), _ste(active_errors), ls='--', c='r', marker='^', markersize=10, lw=2, label='Active CV')
+plt.legend()
+plt.gca().set_xscale('log', base=2.0)
+plt.gca().set_yscale('log')
+plt.grid(visible=True)
+plt.ylabel('Mean Absolute Error (MAE)')
+plt.xlabel('Sample Size ($n$)')
+plt.show()