基于C++实现人工智能期末项目（黑白棋方案探讨）【100011825】资源-CSDN文库

共6个文件

pdf：2个

cpp：1个

docx：1个

版权申诉

人工智能

课程设计

152 浏览量 2023-04-11 16:18:12 上传评论 1 收藏 2.13MB ZIP 举报

资源推荐

资源详情

资源评论

收起资源包目录

100011825-基于C++实现人工智能期末项目（黑白棋方案探讨）.zip （6个子文件）

ai_othello

LICENSE 1KB

lab12 强化学习.pdf 1.03MB

Othello.cpp 22KB

16337327_16337341_zhuzhiru_report.docx 219KB

README.md 10KB

16337327_16337341_zhuzhiru_report.pdf 1.1MB

# 人工智能期末项目 # 一、实验内容 ## 1.1 算法原理 ### 1.1.1 Epsilon-greedy算法 Epsilon-greedy算法是在“exploit”（利用）和“explore”（探索）之间权衡以尽可能实现收益的最大化。假设当前有n台老虎机，每台老虎机吐钱的概率不一样，也不清楚每台老虎机吐钱的概率分布，如何实现收益最大化呢？通常有两种方案，第一种就是在已经探索过的老虎机中找到吐钱最多的那台老虎机，然后坚持摇它，这就是“exploit”（利用）；第二种就是尝试不断探索新的老虎机，在探索过程中可能会找到更好的老虎机，也可能找到吐钱很少的老虎机，这就是“explore”（探索）。Epsilon-greedy算法是每次以epsilon概率进行“explore”（探索），也就是说在所有的老虎机中选择一个进行探索；每次以1 – epsilon概率进行“exploit”（利用），也就是在已经探索过的老虎机中选择收益最大的那个。 Epsilon设置的越高，收敛到最佳收益的速度越快，因为随机搜索的概率越大，越能更快地探索到最好的老虎机。当0.1 < Epsilon < 0.5时，Epsilon设置的越低，最终的平均收益越高，因为每次以较大概率随机探索，将不能有效利用收益最大的老虎机。 ### 1.1.2 Q-learning ![](https://www.writebug.com/myres/static/uploads/2022/7/11/ff1596576fcb2aca7bf73dec51aeee6d.writebug) ### 1.1.3 Sarsa ![](https://www.writebug.com/myres/static/uploads/2022/7/11/df2b2586c443d91389e0f2052eb9dfb0.writebug) ### 1.1.4 遗传算法（队友实现）队友使用的是遗传算法，我并未参与实现，所以在此只介绍相关原理。 ![](https://www.writebug.com/myres/static/uploads/2022/7/11/9faec31eebadbaceec8dd0c61e996a19.writebug) ## 1.2 流程图 ### 1.2.1 Q-learning ![](https://www.writebug.com/myres/static/uploads/2022/7/11/1463b261db076ece4ea2a465ced4f6f2.writebug) ### 1.2.2 Sarsa ![](https://www.writebug.com/myres/static/uploads/2022/7/11/509be4e7c4f2c3a7a3a631f3e7a1c70c.writebug) ## 1.3 关键代码 ### 1.3.1 Q-learning Q-learning在黑白棋中的应用中，每个不同的状态S表示每个不同的棋局，动作a表示下一步下子的位置，动作集A表示可下子位置的集合，状态S的即时回报R表示S所对应棋局的估计值。Q-learning在黑白棋中的步骤为： ![](https://www.writebug.com/myres/static/uploads/2022/7/11/77b3d75149d6d9ee1aafb2fd7291c730.writebug) ```c++ void Q_learning(char board[8][8], char color, void(*function)(char board[8][8], char)) { char opp = color == '@' ? 'O' : '@'; double alpha = 0.8, gama = 0.5; state pre = state(board); default_random_engine e; uniform_real_distribution<double> u(0.0, 1.0); double epsilon = u(e); auto next_step = show_places(board, color); // 根据Epsilon-greedy算法选择动作 if (1 - epsilon < 0.5) { // 以1 – epsilon概率进行“exploit”（利用），在已经探索过的动作中选择收益最大的 double best_value = -10000000; pre.action = next_step[0]; for (int i = 0; i < next_step.size(); ++i) { auto index = state(pre.board, next_step[i]); if (Q.find(index) == Q.end()) continue; else if (Q[index] > best_value) { best_value = Q[index]; pre.action = next_step[i]; }}} else // 以epsilon概率进行“explore”（探索），在所有的动作种随机选择一个 pre.action = next_step[rand() % next_step.size()]; //进入循环 while (!is_over(board)) { // 执行选好的动作 move(board, pre.action, color); if (!show_places(board, opp).empty() && !show_places(board, opp).empty()) function(board, opp); while (!is_over(board) && show_places(board, color).empty()) function(board, opp); // 得到下一个状态 state S = state(board); // 得到下一个状态的即时回报 double reward = evaluate(board, color); next_step = show_places(board, color); // 在下一个状态下找到具有最大Q值的动作 if (!next_step.empty()) { double best_value = -10000000; S.action = next_step[0]; for (int i = 0; i < next_step.size(); ++i) { auto index = state(board, next_step[i]); if (Q.find(index) == Q.end()) continue; else if (Q[index] > best_value) { best_value = Q[index]; S.action = next_step[i]; }}} double new_q = 0, old_q = 0; if (Q.find(pre) != Q.end()) old_q = Q[pre]; if (Q.find(S) != Q.end()) new_q = Q[S]; // 更新表Q Q[pre] = old_q + alpha * (reward + gama * new_q - old_q); // 以epsilon概率进行“explore”（探索） epsilon = u(e); if (!next_step.empty() && epsilon < 0.5) S.action = next_step[rand() % next_step.size()]; pre = S;}} ``` ### 1.3.2 Sarsa Sarsa在黑白棋的应用中，状态S、动作a、动作集A、即时回报R所表示的意义与Q-learning在黑白棋的应用中的相同，Sarsa在在黑白棋中的步骤为： ![](https://www.writebug.com/myres/static/uploads/2022/7/11/d2eb932e429619c360524d5b3b5c5f29.writebug) ```c++ void Sarsa(char board[8][8], char color, void (* function)(char board[8][8], char)) { char opp = color == '@' ? 'O' : '@'; double alpha = 0.8, gama = 0.5; state pre = state(board); default_random_engine e; uniform_real_distribution<double> u(0.0, 1.0); double epsilon = u(e); auto next_step = show_places(board, color); // 根据Epsilon-greedy算法选择动作 if (1 - epsilon < 0.5) { // 以1 – epsilon概率进行“exploit”（利用），在已经探索过的动作中选择收益最大的 double best_value = -10000000; pre.action = next_step[0]; for (int i = 0; i < next_step.size(); ++i) { auto index = state(pre.board, next_step[i]); if (Sarsa_Q.find(index) == Sarsa_Q.end()) continue; else if (Sarsa_Q[index] > best_value) { best_value = Sarsa_Q[index]; pre.action = next_step[i]; }}} else // 以epsilon概率进行“explore”（探索），在所有的动作种随机选择一个 pre.action = next_step[rand() % next_step.size()]; // 进入循环 while (!is_over(board)) { // 执行已选好的动作 move(board, pre.action, color); if (!show_places(board, opp).empty() && !show_places(board, opp).empty()) function(board, opp); while (!is_over(board) && show_places(board, color).empty()) function(board, opp); // 得到下一个状态 state S = state(board); // 得到下一个状态的即时回报 double reward = evaluate(board, color); epsilon = u(e); next_step = show_places(board, color); // 根据Epsilon-greedy算法选择动作 if (!next_step.empty()) if (1 - epsilon < 0.5) { // 以1 – epsilon概率进行“exploit”（利用） double best_value = -10000000; S.action = next_step[0]; for (int i = 0; i < next_step.size(); ++i) { auto index = state(board, next_step[i]); if (Sarsa_Q.find(index) == Sarsa_Q.end()) continue; else if (Sarsa_Q[index] > best_value) {

评论收藏

内容反馈

版权申诉