|
|
$$ \begin{align*} x_A, x_B & = \textrm{number of rewards from users shown variant } A, B \\ x_A & \sim \textrm{Binomial}(n_A, r_A) \\ x_B & \sim \textrm{Binomial}(n_B, r_B) \\ r_A, r_B & \sim \textrm{Beta}(1, 1) \end{align*} $$
$$ \begin{align*} r_A\ |\ n_A, x_A & \sim \textrm{Beta}(x_A + 1, n_A - x_A + 1) \\ r_B\ |\ n_B, x_B & \sim \textrm{Beta}(x_B + 1, n_B - x_B + 1) \end{align*} $$
Thompson sampling randomizes user/variant assignment according to the probability that each variant maximizes the posterior expected reward.
The probability that a user is assigned variant A is
$$ \begin{align*} P(r_A > r_B\ |\ \mathcal{D}) & = \int_0^1 P(r_A > r\ |\ \mathcal{D})\ \pi_B(r\ |\ \mathcal{D})\ dr \\ & = \int_0^1 \left(\int_r^1 \pi_A(s\ |\ \mathcal{D})\ ds\right)\ \pi_B(r\ |\ \mathcal{D})\ dr \\ & \propto \int_0^1 \left(\int_r^1 s^{\alpha_A - 1} (1 - s)^{\beta_A - 1}\ ds\right) r^{\alpha_B - 1} (1 - r)^{\beta_B - 1}\ dr \end{align*} $$
fig
fig
# Fraction of element-wise comparisons in which a_samples exceeds b_samples.
# NOTE(review): presumably a Monte Carlo estimate of P(r_A > r_B | D) built from
# posterior draws for each variant -- confirm where a_samples/b_samples are generated.
(a_samples > b_samples).mean()
0.24299999999999999
class BetaBinomial:
    """Beta distribution over a success rate, updated from binomial counts.

    The state is the pair of Beta shape parameters ``a`` and ``b``. Observing
    ``x`` successes in ``n`` trials adds ``x`` to ``a`` and ``n - x`` to ``b``
    (the conjugate Beta-binomial update).
    """

    def __init__(self, a0=1., b0=1.):
        """Initialise the shape parameters from the prior Beta(a0, b0)."""
        self.a = a0
        self.b = b0

    def sample(self, size=None):
        """Draw from the current Beta(a, b) distribution.

        Args:
            size: Optional output shape forwarded to ``scipy.stats.beta.rvs``.
                Leaving it as ``None`` is the same as not passing it at all,
                so existing ``sample()`` calls behave as before.

        Returns:
            Whatever ``scipy.stats.beta.rvs`` returns for the requested size.
        """
        return sp.stats.beta.rvs(self.a, self.b, size=size)

    def update(self, n, x):
        """Fold ``x`` successes out of ``n`` trials into the distribution.

        Args:
            n: Number of trials observed.
            x: Number of successes among those trials; must satisfy
                ``0 <= x <= n``.

        Raises:
            ValueError: If ``x`` is negative or exceeds ``n``. The state is
                left untouched in that case.
        """
        # Reject impossible counts before mutating anything: they would
        # corrupt the posterior and can push a shape parameter to zero or
        # below, which is not a valid Beta distribution.
        if not 0 <= x <= n:
            raise ValueError(
                "expected 0 <= x <= n, got n={!r}, x={!r}".format(n, x)
            )
        self.a += x
        self.b += n - x
class Bandit:
    """Two-armed bandit that assigns arms by sampling from per-arm posteriors.

    Arm 0 is backed by ``a_post`` and arm 1 by ``b_post``. Each posterior
    object must provide ``sample()`` and ``update(n, x)``.
    """

    def __init__(self, a_post, b_post):
        """Store the posterior objects for arm 0 (``a_post``) and arm 1 (``b_post``)."""
        self.a_post = a_post
        self.b_post = b_post

    def assign(self):
        """Pick an arm by drawing once from each posterior.

        Returns:
            1 if the draw from ``b_post`` is strictly larger than the draw
            from ``a_post``, otherwise 0.
        """
        # Draw from arm 0 first, then arm 1, so the order in which the
        # random number generator is consumed stays fixed.
        a_draw = self.a_post.sample()
        b_draw = self.b_post.sample()
        return 1 * (a_draw < b_draw)

    def update(self, arm, reward):
        """Record one trial with outcome ``reward`` for the given arm.

        Args:
            arm: 0 to update ``a_post``, 1 to update ``b_post``.
            reward: Outcome of the single trial, passed to the posterior's
                ``update`` as the success count.

        Raises:
            ValueError: If ``arm`` is neither 0 nor 1.
        """
        if arm == 0:
            arm_post = self.a_post
        elif arm == 1:
            arm_post = self.b_post
        else:
            # Previously any non-zero index was silently credited to arm 1.
            raise ValueError("arm must be 0 or 1, got {!r}".format(arm))
        arm_post.update(1, reward)
# Reward rates passed to the reward generator for variants A and B,
# and the number of simulated assignments.
A_RATE = 0.05
B_RATE = 0.1
N = 1000

rewards_gen = generate_rewards(A_RATE, B_RATE, N)
bandit = Bandit(BetaBinomial(), BetaBinomial())

# Per-step log of which arm was assigned and which reward it produced.
arms = np.empty(N, dtype=np.int64)
rewards = np.empty(N, dtype=np.float64)

for t, arm_rewards in tqdm(enumerate(rewards_gen), total=N):
    # Store the assignment first, then read it back so the bandit sees the
    # same array-typed values that are logged.
    arms[t] = bandit.assign()
    pulled = arms[t]
    rewards[t] = arm_rewards[pulled]
    bandit.update(pulled, rewards[t])
100%|██████████| 1000/1000 [00:00<00:00, 3333.85it/s]
fig