QLearner3.1

Author	Ethan
Submission date	2019-03-15 02:38:35.581692
Rating	6603
Matches played	254
Win rate	65.35
Use rpsrunner.py to play unranked matches on your computer.
Source code:

import random

# Move letter -> numeric code, plus the reverse lookup derived from it.
moves = dict(zip("RPS", range(3)))
moves_inv = {code: letter for letter, code in moves.items()}

# reward[(opponent, bot move)] is +1 when the bot wins, -1 when it loses and
# 0 on a draw. Each move is beaten by the next code (mod 3), so
# (bot - opponent) % 3 is 0 for a draw, 1 for a win and 2 for a loss.
reward = {
    (opp, bot): (0, 1, -1)[(bot - opp) % 3]
    for opp in range(3)
    for bot in range(3)
}

def zero_array(shape):
    """Return a nested list of zeros with the dimensions listed in shape.

    shape is a non-empty sequence of sizes, outermost dimension first.
    Every inner list is a distinct object, so cells can be assigned
    independently.
    """
    size, inner_shape = shape[0], shape[1:]
    if not inner_shape:
        # Innermost dimension: a flat row of zeros.
        return [0] * size
    return [zero_array(inner_shape) for _ in range(size)]

class Bot:
    """Tabular Q-learning rock-paper-scissors player.

    A state is the opponent's last ``num_past_moves`` moves (numeric codes
    0, 1, 2, oldest first) and an action is the bot's own move code. ``Q`` is
    a nested list indexed by each remembered opponent move in turn and
    finally by the action.
    """

    def __init__(self, alpha, gamma, num_past_moves):
        """Create a bot with an all-zero Q table and an empty move history.

        alpha -- weight given to the new estimate when a Q value is updated.
        gamma -- factor applied to the estimated future value.
        num_past_moves -- number of opponent moves that make up a state.
        """
        self.past_moves = []
        # e.g. with 3 past moves, Q[a][b][c][d] is the value of action d
        # after the opponent played a, b, c.
        self.Q = zero_array([3] * (num_past_moves + 1))
        self.alpha = alpha
        self.gamma = gamma
        self.num_past_moves = num_past_moves

    def _descend(self, indices):
        """Follow indices into the nested Q table and return what is there."""
        node = self.Q
        for index in indices:
            node = node[index]
        return node

    @staticmethod
    def _best_actions(q_values):
        """Return every action index whose value equals the maximum.

        Each index appears exactly once, so a uniform pick from the result
        is a fair tie-break.
        """
        best = max(q_values)
        return [action for action, q in enumerate(q_values) if q == best]

    def next_move(self):
        """Return the bot's next move as a letter ("R", "P" or "S")."""
        # Do a random move until enough opponent moves have been seen
        if len(self.past_moves) < self.num_past_moves:
            return moves_inv[random.randint(0, 2)]

        # Pick the action with the best Q given the opponent's recent moves
        current_Q = self._descend(self.past_moves)

        # Pick a random move in the event of a tie. The candidates must not
        # contain duplicates, otherwise the tie-break is biased towards the
        # duplicated action.
        return moves_inv[random.choice(self._best_actions(current_Q))]

    def update(self, opponent_move, current_move):
        """Learn from one finished round and record the opponent's move.

        Both arguments are numeric move codes, not letters. Until
        ``num_past_moves`` opponent moves have been collected the Q table is
        left untouched and the move is only appended to the history.
        """
        if len(self.past_moves) == self.num_past_moves:
            reward_outcome = reward[(opponent_move, current_move)]

            # Q row for the state the bot acted in
            current_Q = self._descend(self.past_moves)

            # Rows for the three states that keep the remembered moves
            # past_moves[1:] and differ only in the newest opponent move.
            future_states = self._descend(self.past_moves[1:])

            # Average of the best achievable value over those three states.
            # NOTE(review): opponent_move is already known here, so the exact
            # successor state could be used instead -- confirm whether the
            # averaging is intentional.
            possible_future_states = [max(row) for row in future_states]
            avg_future_Q = sum(possible_future_states) / float(len(possible_future_states))

            old_estimate = current_Q[current_move]
            new_estimate = reward_outcome + self.gamma * avg_future_Q
            current_Q[current_move] = (1 - self.alpha) * old_estimate + self.alpha * new_estimate

            # Drop the oldest move so the history stays num_past_moves long
            # once this round's move is appended below.
            self.past_moves.pop(0)

        # update previous moves list
        self.past_moves.append(opponent_move)
        
        
# `input` and `output` are not defined in this file; presumably the match
# runner sets `input` to the opponent's previous move ("" on the first round)
# and reads the bot's move back from `output` -- confirm against rpsrunner.py.
if input == "":
    # No opponent move yet: build the learner (alpha, gamma, num_past_moves).
    bot1 = Bot(0.5, 0.99, 5)
else:
    # Learn from the opponent's last move and the move this bot answered with;
    # `bot1` and `output` are expected to survive from the previous round.
    bot1.update(moves[str(input)], moves[str(output)])
output = str(bot1.next_move())