Hi everyone,
I'm at my wits' end.
I'm currently training a tabular Q-learning agent for Flappy-Bird-Gymnasium and can't figure out why the agent doesn't learn any better. Is the problem the discretization, the algorithm, the reward system, or the hyperparameters?
In my current run the highest number of pipes was 52, with an average of maybe 4 pipes.
Episode: 95000, Score: 56, Best Score: 1986, Best Pipes: 52, Avg Score: 145.90 (approx. 4 pipes), Alpha: 0.0970, Q-Table Size: 177
The score here is the number of frames the agent survived.
I'm using eligibility traces and alpha decay once the average score gets high.
Out of frustration I ran a few AI agents over the code a couple of times, and it didn't really get better.
So maybe one of you knows this stuff; I'm new to this and had the silly idea of wanting to apply what I learned in the lecture haha.
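In case it helps: the update rule I'm trying to implement per step is Q(\lambda) with replacing traces, i.e.

\delta = r + \gamma \max_{a'} Q(s', a') - Q(s, a)
e(s, a) \leftarrow 1 \quad \text{(replacing trace for the visited pair)}
Q(x, b) \leftarrow Q(x, b) + \alpha \, \delta \, e(x, b) \quad \text{for all } (x, b) \text{ with an active trace}
e(x, b) \leftarrow \gamma \lambda \, e(x, b)

At least that's what I think the code below does.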
This is train.py:
# train.py
import gymnasium
import flappy_bird_gymnasium
import numpy as np
import random
import utils
import pickle
import os
from collections import defaultdict
env = gymnasium.make("FlappyBird-v0", render_mode=None, use_lidar=False)
# Hyperparameters
episodes = 100000
alpha = 0.1 # Learning rate
gamma = 0.99 # Discount factor
epsilon = 0.0 # Greedy policy (exploration via optimistic init)
init_q = 0.1 # Optimistic Initialization
lam = 0.8 # Lambda for eligibility traces
trace_min = 0.01 # Threshold to prune traces
# Q-Table as defaultdict
# Returns a list of [Q(s,0), Q(s,1)] initialized to init_q
q_table = defaultdict(lambda: [init_q, init_q])
# Training Metrics
scores_history = []
best_score = 0
best_pipes = 0
print(f"Starting Training: Q-Learning with Traces (Alpha={alpha}, Gamma={gamma}, InitQ={init_q}, Lambda={lam})")
try:
    for episode in range(episodes):
        state, _ = env.reset()
        current_state_key = utils.get_discrete_state(state)

        # Eligibility Traces: Map state -> [trace_action_0, trace_action_1]
        traces = defaultdict(lambda: [0.0, 0.0])

        terminated = False
        truncated = False
        step_count = 0
        total_reward = 0
        score = 0  # Pipes passed

        while not (terminated or truncated):
            # Select Action: Greedy
            q_values = q_table[current_state_key]
            if q_values[0] == q_values[1]:
                action = random.choice([0, 1])
            else:
                action = np.argmax(q_values)

            next_state, reward, terminated, truncated, info = env.step(action)

            # Custom Reward Logic
            if terminated:
                reward = -1000
            elif reward >= 1.0:  # Passed a pipe
                reward = 50.0
                score += 1
            else:
                reward = 1  # Survival reward per frame

            total_reward += reward

            # Discretize Next State
            next_state_key = utils.get_discrete_state(next_state)

            # --- Q-Learning with Eligibility Traces ---

            # dynamic alpha
            # current_alpha = alpha
            # if score >= 10:
            #     reward = 1000
            #     current_alpha = 0.2  # Boost learning for successful runs

            # 1. Calculate TD Error
            # delta = R + gamma * max_a(Q(s', a)) - Q(s, a)
            current_q = q_table[current_state_key][action]
            max_next_q = np.max(q_table[next_state_key])
            delta = reward + gamma * max_next_q - current_q

            # 2. Update Eligibility Trace for current state (Replacing Traces)
            traces[current_state_key][action] = 1.0

            # 3. Update Q-values and Decay Traces for ALL active states
            keys_to_remove = []
            for state_key, trace_values in traces.items():
                # We update both actions for the state if they have traces
                for a in range(2):
                    if trace_values[a] > trace_min:
                        # Update Q-Value
                        q_table[state_key][a] += alpha * delta * trace_values[a]
                        # Decay Trace
                        traces[state_key][a] *= (gamma * lam)
                    else:
                        # Just ensure it's zero if it fell below threshold logic previously
                        traces[state_key][a] = 0.0

                # If both traces are practically zero, mark for removal to keep dict small
                if traces[state_key][0] <= trace_min and traces[state_key][1] <= trace_min:
                    keys_to_remove.append(state_key)

            # Cleanup
            for key in keys_to_remove:
                del traces[key]

            # Move to next state
            current_state_key = next_state_key
            step_count += 1

        scores_history.append(step_count)
        if step_count > best_score:
            best_score = step_count
        if score > best_pipes:
            best_pipes = score

        # Logging
        if episode % 1000 == 0:
            avg_score = np.mean(scores_history[-100:]) if scores_history else 0

            # Alpha Decay on High Score
            if avg_score > 120.0:
                alpha = max(0.005, alpha * 0.99)
                print(f"High Avg Score! Decayed Alpha to {alpha:.4f}")

            print(f"Episode: {episode}, Score: {step_count}, Best Score: {best_score}, Best Pipes: {best_pipes}, Avg Score: {avg_score:.2f}, Pipe: {score}, Alpha: {alpha:.4f}, Q-Table Size: {len(q_table)}")

except KeyboardInterrupt:
    print("Training Interrupted by User")

# Final Save
with open("q_table.pkl", "wb") as f:
    pickle.dump(dict(q_table), f)

print("Training Completed & Saved to q_table.pkl")
env.close()
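For completeness, this is roughly how I watch the agent afterwards (just a quick sketch: load q_table.pkl and play greedily; the zero fallback for unseen states is simply my choice):

# eval.py -- quick sketch: load the saved Q-table and play greedily
import pickle
import numpy as np
import gymnasium
import flappy_bird_gymnasium
import utils

with open("q_table.pkl", "rb") as f:
    q_table = pickle.load(f)

env = gymnasium.make("FlappyBird-v0", render_mode="human", use_lidar=False)
state, _ = env.reset()
terminated = truncated = False
pipes = 0

while not (terminated or truncated):
    key = utils.get_discrete_state(state)
    # States never visited during training are not in the table; fall back to zeros
    q_values = q_table.get(key, [0.0, 0.0])
    action = int(np.argmax(q_values))
    state, reward, terminated, truncated, info = env.step(action)
    if reward >= 1.0:  # passed a pipe
        pipes += 1

print(f"Pipes passed: {pipes}")
env.close()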
utils.py for the discretization:
# utils.py
import math
def get_discrete_state(state):
    """
    Discretizes the state using the FlapAI Bird paper methodology:
        discretized = rounding * math.floor(value / rounding)

    State mapping from FlappyBird-v0:
        0:  last_pipe_x
        1:  last_pipe_y
        2:  last_pipe_y_bottom
        3:  next_pipe_x (dist_x)
        4:  next_pipe_y_top
        5:  next_pipe_y_bottom
        6:  next_next_pipe_x
        7:  next_next_pipe_y_top
        8:  next_next_pipe_y_bottom
        9:  player_y
        10: player_vel
        11: player_rot
    """
    # Extract raw features
    dist_x = state[3]
    dist_y = state[9] - state[5]
    velocity = state[10]

    # Discretization parameters
    round_x = 0.1
    round_y = 0.1
    round_v = 1.0

    # Apply formula
    d_x = round_x * math.floor(dist_x / round_x)
    d_y = round_y * math.floor(dist_y / round_y)
    d_v = round_v * math.floor(velocity / round_v)

    # Return as tuple for dictionary key
    return (round(d_x, 2), round(d_y, 2), round(d_v, 2))
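Just so it's clear what the state keys end up looking like, a quick check with made-up observation values (not real env output):

# quick check of the discretization with made-up values (not real env output)
import utils

example_state = [0.5, 0.3, 0.6, 0.42, 0.35, 0.61, 0.9, 0.4, 0.66, 0.48, -0.37, 0.0]
print(utils.get_discrete_state(example_state))
# dist_x = 0.42 -> 0.4, dist_y = 0.48 - 0.61 = -0.13 -> -0.2, vel = -0.37 -> -1.0
# so the key comes out as roughly (0.4, -0.2, -1.0)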
Best regards