@@ -21,6 +21,29 @@ def __repr__(self):
         params = ", ".join(["{}={}".format(k, v) for (k, v) in HP.items() if k != "id"])
         return "{}({})".format(HP["id"], params)
 
+    @property
+    def hyperparameters(self):
+        """A dictionary of the bandit hyperparameters"""
+        return {}
+
+    @abstractmethod
+    def oracle_payoff(self, context=None):
+        """
+        Return the expected reward for an optimal agent.
+
+        Parameters
+        ----------
+        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)` or None
+            The current context matrix for each of the bandit arms, if
+            applicable. Default is None.
+
+        Returns
+        -------
+        optimal_rwd : float
+            The expected reward under an optimal policy.
+        """
+        pass
+
     def pull(self, arm_id, context=None):
         """
         "Pull" (i.e., sample from) a given arm's payoff distribution.
@@ -43,24 +66,6 @@ def pull(self, arm_id, context=None):
         self.step += 1
         return self._pull(arm_id, context)
 
-    @abstractmethod
-    def oracle_payoff(self, context=None):
-        """
-        Return the expected reward for an optimal agent.
-
-        Parameters
-        ----------
-        context : :py:class:`ndarray <numpy.ndarray>` of shape `(D, K)` or None
-            The current context matrix for each of the bandit arms, if
-            applicable. Default is None.
-
-        Returns
-        -------
-        optimal_rwd : float
-            The expected reward under an optimal policy.
-        """
-        pass
-
     def reset(self):
         """Reset the bandit step and action counters to zero."""
         self.step = 0
@@ -69,11 +74,6 @@ def reset(self):
     def _pull(self, arm_id):
         pass
 
-    @property
-    def hyperparameters(self):
-        """A dictionary of the bandit hyperparameters"""
-        return {}
-
 
 class MultinomialBandit(Bandit):
     def __init__(self, payoffs, payoff_probs):
@@ -114,11 +114,6 @@ def hyperparameters(self):
             "payoff_probs": self.payoff_probs,
         }
 
-    def _pull(self, arm_id, context):
-        payoffs = self.payoffs[arm_id]
-        probs = self.payoff_probs[arm_id]
-        return np.random.choice(payoffs, p=probs)
-
     def oracle_payoff(self, context=None):
         """
         Return the expected reward for an optimal agent.
@@ -135,6 +130,11 @@ def oracle_payoff(self, context=None):
         """
         return self.best_ev
 
+    def _pull(self, arm_id, context):
+        payoffs = self.payoffs[arm_id]
+        probs = self.payoff_probs[arm_id]
+        return np.random.choice(payoffs, p=probs)
+
 
 class BernoulliBandit(Bandit):
     def __init__(self, payoff_probs):
@@ -168,9 +168,6 @@ def hyperparameters(self):
             "payoff_probs": self.payoff_probs,
         }
 
-    def _pull(self, arm_id, context):
-        return int(np.random.rand() <= self.payoff_probs[arm_id])
-
     def oracle_payoff(self, context=None):
         """
         Return the expected reward for an optimal agent.
@@ -187,6 +184,9 @@ def oracle_payoff(self, context=None):
         """
         return self.best_ev
 
+    def _pull(self, arm_id, context):
+        return int(np.random.rand() <= self.payoff_probs[arm_id])
+
 
 class GaussianBandit(Bandit):
     def __init__(self, payoff_dists, payoff_probs):
@@ -286,15 +286,6 @@ def __init__(self, G, start_vertex, end_vertex):
         placeholder = [None] * len(self.paths)
         super().__init__(placeholder, placeholder)
 
-    def _calc_arm_evs(self):
-        I2V = self.G.get_vertex
-        evs = np.zeros(len(self.paths))
-        for p_ix, path in enumerate(self.paths):
-            for ix, v_i in enumerate(path[:-1]):
-                e = [e for e in self.adj_dict[v_i] if e.to == I2V(path[ix + 1])][0]
-                evs[p_ix] -= e.weight
-        return evs
-
     @property
     def hyperparameters(self):
         """A dictionary of the bandit hyperparameters"""
@@ -305,15 +296,6 @@ def hyperparameters(self):
             "start_vertex": self.start_vertex,
         }
 
-    def _pull(self, arm_id, context):
-        reward = 0
-        I2V = self.G.get_vertex
-        path = self.paths[arm_id]
-        for ix, v_i in enumerate(path[:-1]):
-            e = [e for e in self.adj_dict[v_i] if e.to == I2V(path[ix + 1])][0]
-            reward -= e.weight
-        return reward
-
     def oracle_payoff(self, context=None):
         """
         Return the expected reward for an optimal agent.
@@ -330,6 +312,24 @@ def oracle_payoff(self, context=None):
         """
         return self.best_ev
 
+    def _calc_arm_evs(self):
+        I2V = self.G.get_vertex
+        evs = np.zeros(len(self.paths))
+        for p_ix, path in enumerate(self.paths):
+            for ix, v_i in enumerate(path[:-1]):
+                e = [e for e in self.adj_dict[v_i] if e.to == I2V(path[ix + 1])][0]
+                evs[p_ix] -= e.weight
+        return evs
+
+    def _pull(self, arm_id, context):
+        reward = 0
+        I2V = self.G.get_vertex
+        path = self.paths[arm_id]
+        for ix, v_i in enumerate(path[:-1]):
+            e = [e for e in self.adj_dict[v_i] if e.to == I2V(path[ix + 1])][0]
+            reward -= e.weight
+        return reward
+
 
 class ContextualBernoulliBandit(Bandit):
     def __init__(self, context_probs):
@@ -379,12 +379,6 @@ def get_context(self):
         context[np.random.choice(D), :] = 1
         return random_one_hot_matrix(1, D).ravel()
 
-    def _pull(self, arm_id, context):
-        D, K = self.context_probs.shape
-        arm_probs = context[:, arm_id] @ self.context_probs
-        arm_rwds = (np.random.rand(K) <= arm_probs).astype(int)
-        return arm_rwds[arm_id]
-
     def oracle_payoff(self, context):
         """
         Return the expected reward for an optimal agent.
@@ -402,6 +396,12 @@ def oracle_payoff(self, context):
         """
         return context[:, 0] @ self.best_ev
 
+    def _pull(self, arm_id, context):
+        D, K = self.context_probs.shape
+        arm_probs = context[:, arm_id] @ self.context_probs
+        arm_rwds = (np.random.rand(K) <= arm_probs).astype(int)
+        return arm_rwds[arm_id]
+
 
 class ContextualLinearBandit(Bandit):
     def __init__(self, K, D, payoff_variance=1):
@@ -484,12 +484,6 @@ def get_context(self):
         """
         return np.random.normal(size=(self.D, self.K))
 
-    def _pull(self, arm_id, context):
-        K, thetas = self.K, self.thetas
-        self._noise = np.random.normal(scale=self.payoff_variance, size=self.K)
-        self.arm_evs = np.array([context[:, k] @ thetas[:, k] for k in range(K)])
-        return (self.arm_evs + self._noise)[arm_id]
-
     def oracle_payoff(self, context):
         """
         Return the expected reward for an optimal agent.
@@ -507,3 +501,9 @@ def oracle_payoff(self, context):
         """
         best_arm = np.argmax(self.arm_evs)
         return self.arm_evs[best_arm]
+
+    def _pull(self, arm_id, context):
+        K, thetas = self.K, self.thetas
+        self._noise = np.random.normal(scale=self.payoff_variance, size=self.K)
+        self.arm_evs = np.array([context[:, k] @ thetas[:, k] for k in range(K)])
+        return (self.arm_evs + self._noise)[arm_id]
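
This diff only reorders methods; the public interface (pull, oracle_payoff, hyperparameters, reset) is unchanged. Below is a minimal usage sketch of that interface, not part of the patch. The import path (a "bandits" module) and the random policy are assumptions for illustration only.

# Illustrative sketch only -- the `bandits` import path is hypothetical, not taken from this diff.
import numpy as np

from bandits import BernoulliBandit  # assumed module path

bandit = BernoulliBandit(payoff_probs=[0.1, 0.5, 0.8])
bandit.reset()

rewards = []
for _ in range(1000):
    arm = np.random.randint(3)        # random policy, purely for illustration
    rewards.append(bandit.pull(arm))  # `context` defaults to None for non-contextual bandits

# Average per-step regret of the random policy relative to an oracle
# that always plays the best arm (bandit.oracle_payoff() == best_ev here).
regret = bandit.oracle_payoff() - np.mean(rewards)
print(bandit.hyperparameters, regret)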