I put this together as a review exercise for the multi-armed bandit algorithm study group currently running at my workplace. I used the O'Reilly book バンディットアルゴリズムによる最適化手法 (Bandit Algorithms for Website Optimization) as a reference, but the sample code in the book is so hard to read that what was meant to be a straight transcription turned into a substantial rewrite.
At this stage the problem setting is still very simple. I'm curious what happens in settings where arms are added or removed, or where the reward distribution of a given arm changes over time.
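As a rough illustration of that last scenario, here is a minimal sketch of an arm whose success probability drifts as a bounded random walk. The class name DriftingBernoulliArm and the drift parameter are my own hypothetical choices, not from the book; the class only follows the same draw() interface as the arms used below, so it could be fed to test_algorithm unchanged.

import random

class DriftingBernoulliArm(object):
    """ Hypothetical arm whose success probability drifts over time (illustration only). """
    def __init__(self, p, drift=0.01):
        self.p = p          # current success probability
        self.drift = drift  # maximum size of the random-walk step applied after each draw

    def draw(self):
        reward = 1.0 if self.p > random.random() else 0.0
        # take a small random-walk step, clipped so p stays a valid probability
        self.p = min(1.0, max(0.0, self.p + random.uniform(-self.drift, self.drift)))
        return reward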
In [20]:
%matplotlib inline
import matplotlib.pyplot as plt
from numpy import zeros, array, average, random as nprandom
import random
Helpers for running tests and plotting results
In [21]:
def test_algorithm(algo, arms, num_sims, horizon):
    chosen_arms = zeros((num_sims, horizon), dtype=int)  # arm chosen at each step of each simulation
    rewards = zeros((num_sims, horizon))                 # reward received at each step
    cumulative_rewards = zeros((num_sims, horizon))      # running total of rewards per simulation
    for sim in range(num_sims):
        algo.initialize(len(arms))
        for t in range(horizon):
            chosen_arm = algo.select_arm()
            chosen_arms[sim, t] = chosen_arm
            reward = arms[chosen_arm].draw()
            rewards[sim, t] = reward
            algo.update(chosen_arm, reward)
        cumulative_rewards[sim] = rewards[sim].cumsum()
    return chosen_arms, rewards, cumulative_rewards
In [22]:
def plot_results(simulate_num, horizon, best_arm, results):
    fig1, axes1 = plt.subplots(nrows=1, ncols=3, figsize=(11, 5))
    x = range(horizon)
    plot1, plot2, plot3 = axes1[0], axes1[1], axes1[2]
    for result in results:
        accuracy = zeros(horizon)
        reward_ave = zeros(horizon)
        cumulative_rewards = zeros(horizon)
        epsilon = result[0]
        chosen_arms_mat = result[1]
        rewards_mat = result[2]
        cumulative_rewards_mat = result[3]
        for i in range(horizon):
            # fraction of simulations that picked the best arm at step i
            best_arm_selected_count = (chosen_arms_mat[:, i] == best_arm).sum()
            accuracy[i] = best_arm_selected_count / float(simulate_num)
            reward_ave[i] = average(rewards_mat[:, i])
            cumulative_rewards[i] = average(cumulative_rewards_mat[:, i])
        plot1.plot(x, accuracy, label='%10.2f' % epsilon)
        plot2.plot(x, reward_ave, label='%10.2f' % epsilon)
        plot3.plot(x, cumulative_rewards, label='%10.2f' % epsilon)
    plot1.legend(loc=4)
    plot1.set_xlabel('Time')
    plot1.set_ylabel('Probability of Selecting Best Arm')
    plot1.set_title('Accuracy of the \nEpsilon Greedy Algorithm')
    plot2.legend(loc=4)
    plot2.set_xlabel('Time')
    plot2.set_ylabel('Average Reward')
    plot2.set_title('Performance of the \nEpsilon Greedy Algorithm')
    plot3.legend(loc=4)
    plot3.set_xlabel('Time')
    plot3.set_ylabel('Cumulative Reward of Chosen Arm')
    plot3.set_title('Cumulative Reward of the \nEpsilon Greedy Algorithm')
Implementation of the Epsilon-Greedy algorithm
In [23]:
class EpsilonGreedy(object):
    def __init__(self, epsilon):
        self.epsilon = epsilon  # probability of exploring
        self.counts = None
        self.values = None

    def initialize(self, n_arms):
        # how many times each arm has been pulled
        self.counts = zeros(n_arms, dtype=int)
        # running mean reward of each arm
        self.values = zeros(n_arms)

    def select_arm(self):
        """ Return the index of the arm to pull next. """
        if self.epsilon > random.random():
            # explore with probability epsilon
            return random.randrange(len(self.values))
        else:
            # otherwise exploit the arm with the highest estimated value
            return self.values.argmax()

    def update(self, chosen_arm, reward):
        # increment the pull count of the chosen arm
        self.counts[chosen_arm] += 1
        n = self.counts[chosen_arm]
        # update the arm's mean reward incrementally
        value = self.values[chosen_arm]
        new_value = ((n - 1) / float(n)) * value + (1 / float(n)) * reward
        self.values[chosen_arm] = new_value
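A quick sanity check (my own addition, not part of the original notebook): the incremental update above is the standard running-mean formula, so after a few rewards values[chosen_arm] should equal the plain arithmetic mean of the observed rewards.

algo = EpsilonGreedy(0.1)
algo.initialize(1)
observed = [1.0, 0.0, 1.0, 1.0, 0.0]
for r in observed:
    algo.update(0, r)
print(algo.values[0], sum(observed) / len(observed))  # both print 0.6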
Implementation of the arms
In [24]:
class BernoulliArm(object):
    """ An arm that pays out rewards according to a Bernoulli distribution. """
    def __init__(self, p):
        self.p = p

    def draw(self):
        # return 1.0 with probability p, otherwise 0.0
        return 1.0 if self.p > random.random() else 0.0
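A small usage check (my own addition): over many draws the empirical mean reward of a BernoulliArm should approach its parameter p.

arm = BernoulliArm(0.3)
samples = [arm.draw() for _ in range(10000)]
print(average(samples))  # should be close to 0.3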
Run
In [25]:
SIMULATE_NUM = 5000
HORIZON = 250

means = [0.1, 0.1, 0.1, 0.1, 0.9]
random.shuffle(means)
arms = [BernoulliArm(mu) for mu in means]
best_arm = array(means).argmax()

test_results = []
for epsilon in [0.1, 0.2, 0.3, 0.4, 0.5]:
    algo = EpsilonGreedy(epsilon)
    chosen_arms_mat, rewards_mat, cumulative_rewards_mat = test_algorithm(algo, arms, SIMULATE_NUM, HORIZON)
    test_results.append([epsilon, chosen_arms_mat, rewards_mat, cumulative_rewards_mat])

plot_results(SIMULATE_NUM, HORIZON, best_arm, test_results)
Exercise
In [26]:
# The same experiment with 200 arms
SIMULATE_NUM = 5000
HORIZON = 250

means = nprandom.rand(200)
random.shuffle(means)
arms = [BernoulliArm(mu) for mu in means]
best_arm = array(means).argmax()

test_results = []
for epsilon in [0.1, 0.2, 0.3, 0.4, 0.5]:
    algo = EpsilonGreedy(epsilon)
    chosen_arms_mat, rewards_mat, cumulative_rewards_mat = test_algorithm(algo, arms, SIMULATE_NUM, HORIZON)
    test_results.append([epsilon, chosen_arms_mat, rewards_mat, cumulative_rewards_mat])

plot_results(SIMULATE_NUM, HORIZON, best_arm, test_results)