{"id":1727,"date":"2023-04-18T17:38:22","date_gmt":"2023-04-18T09:38:22","guid":{"rendered":"https:\/\/www.aqwu.net\/wp\/?p=1727"},"modified":"2023-04-18T17:38:22","modified_gmt":"2023-04-18T09:38:22","slug":"%e5%a6%82%e4%bd%95%e8%ae%ad%e7%bb%83%e4%b8%80%e4%b8%aa%e8%87%aa%e5%b7%b1%e7%9a%84gpt%e6%a8%a1%e5%9e%8b","status":"publish","type":"post","link":"https:\/\/www.aqwu.net\/wp\/?p=1727","title":{"rendered":"\u5982\u4f55\u8bad\u7ec3\u4e00\u4e2a\u81ea\u5df1\u7684GPT\u6a21\u578b"},"content":{"rendered":"\n<h1 class=\"wp-block-heading\">\u4e00\u3001\u6982\u8ff0<\/h1>\n\n\n\n<h3 class=\"wp-block-heading\">1\u3001\u4ec0\u4e48\u662fGPT<\/h3>\n\n\n\n<p>GPT\uff0c\u5168\u79f0\u201cGenerative Pre-training Transformer\u201d\uff0c\u5373\u751f\u6210\u5f0f\u9884\u8bad\u7ec3\u53ef\u8fc1\u79fb\u6ce8\u610f\u529b\u6a21\u578b\uff0c\u7b80\u79f0\u201c\u6587\u672c\u751f\u6210\u5668\u201d\u3002\u672c\u8d28\u4e0a\u662fNLP\u81ea\u7136\u8bed\u8a00\u5904\u7406\u9886\u57df\u91c7\u7528Transformer\u7684attention\u6ce8\u610f\u529b\u673a\u5236\u89e3\u51b3\u975e\u7cbe\u786e\u7684\u6a21\u7cca\u5316\u8bed\u4e49\u95ee\u9898\uff0c\u4ece\u800c\u5e26\u6765\u4e86NLP\u9886\u57df\u7684\u91cc\u7a0b\u7891\u5f0f\u6539\u53d8\u3002<\/p>\n\n\n\n<p>Transformer\u7684attention\u673a\u5236\u7684\u6838\u5fc3\u662f\u901a\u8fc7\u7f16\u89e3\u7801\uff0c\u5c06\u4eba\u7c7b\u8bed\u8a00\u89e3\u6790\u6210\u53ca\u5176\u53ef\u4ee5\u7406\u89e3\u7684\u6570\u5b57\uff0c\u6a21\u578b\u901a\u8fc7\u6ce8\u610f\u529b\u673a\u5236\u5bfb\u627e\u5411\u91cf\u4e4b\u95f4\u7684\u5173\u7cfb\u5e76\u751f\u6210\u6a21\u578b\u53c2\u6570\uff0c\u518d\u6839\u636e\u53c2\u6570\uff0c\u901a\u8fc7\u7f16\u7801\u5668\u751f\u6210\u6211\u4eec\u7b26\u5408\u9884\u671f\u7684\u7ed3\u679c\uff0c\u53c2\u6570\u91cf\u8d8a\u5927\uff0c\u7ed3\u679c\u8d8a\u51c6\u786e\u3002\u53c2\u6570\u6700\u521d\u662f\u4eba\u4e3a\u6807\u6ce8\u6570\u636e\u5f62\u6210\u7684\uff0c\u4e4b\u540e\u901a\u8fc7\u5f3a\u5316\u5b66\u4e60+\u5956\u52b1\u673a\u5236\u81ea\u52a8\u751f\u6210\u3002\u4ece\u4e0b\u56fe\u53ef\u4ee5\u770b\u5230\uff0c\u5b83\u4e0e\u68cb\u7c7b\u7684\u5f3a\u5316\u5b66\u4e60\u540e\u9884\u6d4b\u4e0b\u6b65\u8d70\u6cd5\u7c7b\u4f3c\uff0c\u53ea\u4e0d\u8fc7\u9884\u6d4b\u7684\u662f\u4e0b\u4e00\u4e2a\u5e94\u8be5\u51fa\u73b0\u7684Word\u5355\u8bcd\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><a href=\"https:\/\/blog.yanjingang.com\/wp-content\/uploads\/2023\/02\/gpt-yl.jpeg\"><img decoding=\"async\" src=\"https:\/\/blog.yanjingang.com\/wp-content\/uploads\/2023\/02\/gpt-yl-1024x597.jpeg\" alt=\"\" class=\"wp-image-7176\"\/><\/a><\/figure>\n\n\n\n<h3 
class=\"wp-block-heading\">2\u3001GPT\u7684\u53d1\u5c55<\/h3>\n\n\n\n<p>GPT\u4e8e2018\u5e74\u7531\u57c3\u9686\u00b7\u9a6c\u65af\u514b\u8054\u5408\u521b\u529e\u7684\u4eba\u5de5\u667a\u80fd\u7814\u7a76\u5b9e\u9a8c\u5ba4OpenAI\u53d1\u5e03\uff0c\u662f\u4e00\u79cd\u81ea\u56de\u5f52\u8bed\u8a00\u6a21\u578b\uff0c\u8fd9\u79cd\u6a21\u578b\u5229\u7528\u6df1\u5ea6\u5b66\u4e60\u4ea7\u751f\u7c7b\u4f3c\u4e8e\u4eba\u7c7b\u8bed\u8a00\u7684\u6587\u672c\u3002\u901a\u4fd7\u7684\u89e3\u91ca\uff0cGPT\u5c31\u662f\u4e00\u79cd\u4f1a\u4e0d\u65ad\u5b66\u4e60\u5e76\u81ea\u884c\u5b8c\u6210\u6587\u5b57\u76f8\u5173\u5de5\u4f5c\u7684\u7535\u8111\u7a0b\u5e8f\uff0c\u4e14\u5728\u5b66\u4e60\u8fc7\u7a0b\u4e2d\u65e0\u9700\u4efb\u4f55\u5916\u754c\u4eba\u5458\u64cd\u4f5c\u3002<\/p>\n\n\n\n<p>\u73b0\u5982\u4eca\uff0cGPT\u5df2\u7ecf\u53d1\u5c55\u5230\u7b2c3.5\u4ee3\uff0c\u5373GPT-3.5\uff0c\u5927\u5bb6\u76ee\u524d\u7528\u5230\u7684ChatGPT\u5c31\u662f\u8fd9\u4e2a\u7248\u672c\u3002\u4e0e2018\u5e74\u7b2c\u4e00\u7248GPT 1.17\u4ebf\u7684\u53c2\u6570\u76f8\u6bd4\uff0cGPT-3.5\u62e5\u6709\u77401750\u4ebf\u7684\u53c2\u6570\u91cf\u3002\u8fd9\u4e4b\u95f4\u5de8\u5927\u7684\u53c2\u6570\u5dee\u8ddd\uff0c\u6781\u5927\u7a0b\u5ea6\u63d0\u5347\u4e86GPT\u7684\u8fd0\u884c\u89c4\u6a21\uff0c\u8ba9GPT\u5f00\u59cb\u80fd\u5b8c\u6210\u4e00\u7cfb\u5217\u4ee4\u4eba\u96be\u4ee5\u7f6e\u4fe1\u7684\u4efb\u52a1\u3002<\/p>\n\n\n\n<p>\u73b0\u9636\u6bb5\u7684GPT\u5df2\u7ecf\u80fd\u505a\u5230\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>1.\u4f5c\u4e3a\u57fa\u4e8e\u95ee\u9898\u7684\u641c\u7d22\u5f15\u64ce\uff08\u7c7b\u4f3c\u767e\u5ea6\uff09\uff1b<\/li>\n\n\n\n<li>2.\u80fd\u4f5c\u4e3a\u5386\u53f2\u4eba\u7269\u4e0e\u5176\u4ea4\u8c08\u7684\u804a\u5929\u673a\u5668\u4eba\uff1b<\/li>\n\n\n\n<li>3.\u56de\u7b54\u533b\u7597\u95ee\u9898\uff1b<\/li>\n\n\n\n<li>4.\u8c31\u5199\u5409\u4ed6\u66f2\u8c31\uff1b<\/li>\n\n\n\n<li>5.\u82f1\u6587\u7ffb\u8bd1\uff1b<\/li>\n\n\n\n<li>6.\u81ea\u52a8\u521b\u4f5c\u5982\u5c0f\u8bf4\uff1b<\/li>\n<\/ul>\n\n\n\n<p>\u7b49\u7b49\u4e00\u7cfb\u5217\u590d\u6742\u5de5\u4f5c\u3002<\/p>\n\n\n\n<p>GPT-4\u9884\u8ba1\u5c06\u572823\u5e74\u521d\u53d1\u5e03\uff0c\u76f8\u8f83\u4e8eGPT-3.5\uff0cGPT-4\u7684\u6027\u80fd\u5c06\u6709\u7740\u8df3\u8dc3\u5f0f\u7684\u63d0\u5347\uff0c\u53c2\u6570\u91cf\u5c06\u5927\u4e8e1\u4e07\u4ebf\uff0c\u5df2\u901a\u8fc7\u56fe\u7075\u6d4b\u8bd5\uff0c\u66f4\u6709\u4f20\u8a00\u8868\u793aGPT-4\u5148\u8fdb\u5230\u51e0\u4e4e\u4e0e\u4eba\u7c7b\u6ca1\u6709\u533a\u522b\u7684\u6c34\u5e73\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><a href=\"https:\/\/blog.yanjingang.com\/wp-content\/uploads\/2023\/02\/gpt-4.png\"><img decoding=\"async\" src=\"https:\/\/blog.yanjingang.com\/wp-content\/uploads\/2023\/02\/gpt-4.png\" alt=\"\" 
class=\"wp-image-7106\"\/><\/a><\/figure>\n\n\n\n<p>\u5f53\u7136\uff0c\u4ece\u96f6\u5f00\u59cb\u8bad\u7ec3\u4e00\u4e2aGPT\u6a21\u578b\u7684\u6210\u672c\u662f\u5de8\u5927\u7684\uff0cGPT3\u6a21\u578b\u5b9e\u9645\u662f\u57283w\u5f20A100\u5361\u4e0a\u8bad\u7ec3\u7684\u3002\u5982\u679c\u6210\u672c\u5e95\u70b9\uff0c\u57281000\u5f20A100\u5361\u4e0a\u8bad\u7ec3\u4e00\u6b21\u9700\u89812\u4e2a\u6708\uff0c\u8d2d\u4e70GPU\u548c\u914d\u5957+\u673a\u623f\u6258\u7ba1+\u7535\u8d39\u6210\u672c\u81f3\u5c11\u4e5f\u89811-2\u4ebf\u3002<\/p>\n\n\n\n<p>\u53ef\u662f\u4f5c\u4e3a\u7a77\u4eba\uff0c\u96be\u9053\u5c31\u4e0d\u80fd\u5b66\u4e60\u4e86\u5417\uff1f\u5f53\u7136\u662f\u53ef\u4ee5\u7684\uff0c\u672c\u6587\u5c31\u5e26\u4f60\u4e00\u8d77\u6765\u5b66\u4e60\u5982\u4f55\u5728\u666e\u901aGPU\u4e0a\u8bad\u7ec3GPT\u6a21\u578b\u3002<\/p>\n\n\n\n<h1 class=\"wp-block-heading\">\u4e8c\u3001\u8bed\u6599\u51c6\u5907<\/h1>\n\n\n\n<p>\u8981\u60f3\u8bad\u7ec3\u51fa\u4e00\u4e2a\u597d\u7684\u6a21\u578b\uff0c\u9700\u8981\u5582\u5165\u4f18\u8d28\u7684\u8bed\u6599\uff0c\u4ee5\u4e0b\u662f\u4ece\u7f51\u4e0a\u6536\u96c6\u5230\u7684\u90e8\u5206\u8bed\u6599\u6570\u636e\u96c6\u3002<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">1\u3001THUCNews<\/h4>\n\n\n\n<p>\u6e05\u534e\u5927\u5b66\u81ea\u7136\u8bed\u8a00\u5904\u7406\u4e0e\u793e\u4f1a\u4eba\u6587\u8ba1\u7b97\u5b9e\u9a8c\u5ba4THUCNews\u4e2d\u6587\u6587\u672c\u6570\u636e\u96c6<\/p>\n\n\n\n<p><a href=\"https:\/\/thunlp.oss-cn-qingdao.aliyuncs.com\/THUCNews.zip\">https:\/\/thunlp.oss-cn-qingdao.aliyuncs.com\/THUCNews.zip<\/a><\/p>\n\n\n\n<h4 class=\"wp-block-heading\">2\u3001\u7ef4\u57fa\u767e\u79d1<\/h4>\n\n\n\n<p><a href=\"https:\/\/pan.baidu.com\/s\/1uPMlIY3vhusdnhAge318TA\">https:\/\/pan.baidu.com\/s\/1uPMlIY3vhusdnhAge318TA<\/a><\/p>\n\n\n\n<p>json\u7248(wiki2019zh)\uff0c104\u4e07\u4e2a\u8bcd\u6761(1,043,224\u6761; \u539f\u59cb\u6587\u4ef6\u5927\u5c0f1.6G\uff0c\u538b\u7f29\u6587\u4ef6519M\uff1b\u6570\u636e\u66f4\u65b0\u65f6\u95f4\uff1a2019.2.7)\uff0c\u53ef\u4ee5\u505a\u4e3a\u901a\u7528\u4e2d\u6587\u8bed\u6599\uff0c\u505a\u9884\u8bad\u7ec3\u7684\u8bed\u6599\u6216\u6784\u5efa\u8bcd\u5411\u91cf\uff0c\u4e5f\u53ef\u4ee5\u7528\u4e8e\u6784\u5efa\u77e5\u8bc6\u95ee\u7b54\u3002<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">3\u3001\u65b0\u95fb\u8bed\u6599<\/h4>\n\n\n\n<p><a href=\"https:\/\/drive.google.com\/file\/d\/1TMKu1FpTr6kcjWXWlQHX7YJsMfhhcVKp\/view?usp=sharing\">https:\/\/drive.google.com\/file\/d\/1TMKu1FpTr6kcjWXWlQHX7YJsMfhhcVKp\/view?usp=sharing<\/a><\/p>\n\n\n\n<p>json\u7248(news2016zh)\uff0c\u5305\u542b\u4e86250\u4e07\u7bc7\u65b0\u95fb\u3002\u65b0\u95fb\u6765\u6e90\u6db5\u76d6\u4e866.3\u4e07\u4e2a\u5a92\u4f53\uff0c\u542b\u6807\u9898\u3001\u5173\u952e\u8bcd\u3001\u63cf\u8ff0\u3001\u6b63\u6587( \u539f\u59cb\u6570\u636e9G\uff0c\u538b\u7f29\u6587\u4ef63.6G\uff1b\u65b0\u95fb\u5185\u5bb9\u8de8\u5ea6\uff1a2014-2016\u5e74)\u3002\u8bad\u7ec3\u96c6\uff1a243\u4e07\uff1b\u9a8c\u8bc1\u96c6\uff1a7.7\u4e07\u3002<\/p>\n\n\n\n<p>\u53ef\u4ee5\u505a\u4e3a\u3010\u901a\u7528\u4e2d\u6587\u8bed\u6599\u3011\uff0c\u8bad\u7ec3\u3010\u8bcd\u5411\u91cf\u3011\u6216\u505a\u4e3a\u3010\u9884\u8bad\u7ec3\u3011\u7684\u8bed\u6599\uff1b \u4e5f\u53ef\u4ee5\u7528\u4e8e\u8bad\u7ec3\u3010\u6807\u9898\u751f\u6210\u3011\u6a21\u578b\uff0c\u6216\u8bad\u7ec3\u3010\u5173\u952e\u8bcd\u751f\u6210\u3011\u6a21\u578b\uff08\u9009\u5173\u952e\u8bcd\u5185\u5bb9\u4e0d\u540c\u4e8e\u6807\u9898\u7684\u6570\u636e\uff09\uff1b \u4ea6\u53ef\u4ee5\u901a\u8fc7\u65b0\u95fb\u6e20\u9053\u533a\u5206\u51fa\u65b0\u95fb\u7684\u7c7b\u578b\u3002<\/p>\n\n\n\n<h4 
class=\"wp-block-heading\">4\u3001\u767e\u79d1\u7c7b\u95ee\u7b54<\/h4>\n\n\n\n<p>https:\/\/pan.baidu.com\/s\/12TCEwC_Q3He65HtPKN17cA\u3000\u3000# fu45<\/p>\n\n\n\n<p>json\u7248(baike2018qa)\uff0c\u542b\u6709150\u4e07\u4e2a\u9884\u5148\u8fc7\u6ee4\u8fc7\u7684\u3001\u9ad8\u8d28\u91cf\u95ee\u9898\u548c\u7b54\u6848\uff0c\u6bcf\u4e2a\u95ee\u9898\u5c5e\u4e8e\u4e00\u4e2a\u7c7b\u522b\u3002\u603b\u5171\u6709492\u4e2a\u7c7b\u522b\uff0c\u5176\u4e2d\u9891\u7387\u8fbe\u5230\u6216\u8d85\u8fc710\u6b21\u7684\u7c7b\u522b\u6709434\u4e2a\u3002\u8bad\u7ec3\u96c6\uff1a142.5\u4e07\uff1b\u9a8c\u8bc1\u96c6\uff1a4.5\u4e07\u3002<\/p>\n\n\n\n<p>\u53ef\u4ee5\u505a\u4e3a\u901a\u7528\u4e2d\u6587\u8bed\u6599\uff0c\u8bad\u7ec3\u8bcd\u5411\u91cf\u6216\u505a\u4e3a\u9884\u8bad\u7ec3\u7684\u8bed\u6599\uff1b\u4e5f\u53ef\u4ee5\u7528\u4e8e\u6784\u5efa\u767e\u79d1\u7c7b\u95ee\u7b54\uff1b\u5176\u4e2d\u7c7b\u522b\u4fe1\u606f\u6bd4\u8f83\u6709\u7528\uff0c\u53ef\u4ee5\u7528\u4e8e\u505a\u76d1\u7763\u8bad\u7ec3\uff0c\u4ece\u800c\u6784\u5efa \u66f4\u597d\u53e5\u5b50\u8868\u793a\u7684\u6a21\u578b\u3001\u53e5\u5b50\u76f8\u4f3c\u6027\u4efb\u52a1\u7b49\u3002<\/p>\n\n\n\n<h4 class=\"wp-block-heading\">5\u3001\u793e\u533a\u95ee\u7b54<\/h4>\n\n\n\n<p><a href=\"https:\/\/drive.google.com\/open?id=1u2yW_XohbYL2YAK6Bzc5XrngHstQTf0v\">https:\/\/drive.google.com\/open?id=1u2yW_XohbYL2YAK6Bzc5XrngHstQTf0v<\/a><\/p>\n\n\n\n<p>json\u7248(webtext2019zh) \u5927\u89c4\u6a21\u9ad8\u8d28\u91cf\u6570\u636e\u96c6,\u542b\u6709410\u4e07\u4e2a\u9884\u5148\u8fc7\u6ee4\u8fc7\u7684\u3001\u9ad8\u8d28\u91cf\u95ee\u9898\u548c\u56de\u590d\u3002\u6bcf\u4e2a\u95ee\u9898\u5c5e\u4e8e\u4e00\u4e2a\u3010\u8bdd\u9898\u3011\uff0c\u603b\u5171\u67092.8\u4e07\u4e2a\u5404\u5f0f\u8bdd\u9898\uff0c\u8bdd\u9898\u5305\u7f57\u4e07\u8c61\u3002\u4ece1400\u4e07\u4e2a\u539f\u59cb\u95ee\u7b54\u4e2d\uff0c\u7b5b\u9009\u51fa\u81f3\u5c11\u83b7\u5f973\u4e2a\u70b9\u8d5e\u4ee5\u4e0a\u7684\u7684\u7b54\u6848\uff0c\u4ee3\u8868\u4e86\u56de\u590d\u7684\u5185\u5bb9\u6bd4\u8f83\u4e0d\u9519\u6216\u6709\u8da3\uff0c\u4ece\u800c\u83b7\u5f97\u9ad8\u8d28\u91cf\u7684\u6570\u636e\u96c6\u3002\u9664\u4e86\u5bf9\u6bcf\u4e2a\u95ee\u9898\u5bf9\u5e94\u4e00\u4e2a\u8bdd\u9898\u3001\u95ee\u9898\u7684\u63cf\u8ff0\u3001\u4e00\u4e2a\u6216\u591a\u4e2a\u56de\u590d\u5916\uff0c\u6bcf\u4e2a\u56de\u590d\u8fd8\u5e26\u6709\u70b9\u8d5e\u6570\u3001\u56de\u590dID\u3001\u56de\u590d\u8005\u7684\u6807\u7b7e\u3002\u8bad\u7ec3\u96c6\uff1a412\u4e07\uff1b\u9a8c\u8bc1\u96c6\uff1a6.8\u4e07\uff1b\u6d4b\u8bd5\u96c6\uff1a6.8\u4e07\u3002\u53ef\u7528\u4e8e\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>1\uff09\u6784\u5efa\u767e\u79d1\u7c7b\u95ee\u7b54\uff1a\u8f93\u5165\u4e00\u4e2a\u95ee\u9898\uff0c\u6784\u5efa\u68c0\u7d22\u7cfb\u7edf\u5f97\u5230\u4e00\u4e2a\u56de\u590d\u6216\u751f\u4ea7\u4e00\u4e2a\u56de\u590d\uff1b\u6216\u6839\u636e\u76f8\u5173\u5173\u952e\u8bcd\u4ece\uff0c\u793e\u533a\u95ee\u7b54\u5e93\u4e2d\u7b5b\u9009\u51fa\u4f60\u76f8\u5173\u7684\u9886\u57df\u6570\u636e<\/li>\n\n\n\n<li>2\uff09\u8bad\u7ec3\u8bdd\u9898\u9884\u6d4b\u6a21\u578b\uff1a\u8f93\u5165\u4e00\u4e2a\u95ee\u9898(\u548c\u6216\u63cf\u8ff0)\uff0c\u9884\u6d4b\u5c5e\u4e8e\u8bdd\u9898\u3002<\/li>\n\n\n\n<li>3\uff09\u8bad\u7ec3\u793e\u533a\u95ee\u7b54(cQA)\u7cfb\u7edf\uff1a\u9488\u5bf9\u4e00\u95ee\u591a\u7b54\u7684\u573a\u666f\uff0c\u8f93\u5165\u4e00\u4e2a\u95ee\u9898\uff0c\u627e\u5230\u6700\u76f8\u5173\u7684\u95ee\u9898\uff0c\u5728\u8fd9\u4e2a\u57fa\u7840\u4e0a\u57fa\u4e8e\u4e0d\u540c\u7b54\u6848\u56de\u590d\u7684\u8d28\u91cf\u3001 
4) serve as a general Chinese corpus for large-model pre-training or for training word vectors. Here too the category information is quite useful: it can support supervised training, building models with better sentence representations, sentence-similarity tasks, and so on;
5) use the upvote counts as an extra signal to predict how popular a reply will be, or to train an answer-scoring system.

#### 6. Translation corpus

https://drive.google.com/open?id=1EX8eE5YWBxCaohBO8Fh4e2j3b9C2bTVQ

translation2019zh: 5.2 million Chinese-English parallel pairs. Each pair contains an English sentence and its Chinese counterpart; in most cases either side is a complete sentence with punctuation. For a parallel pair, the Chinese side averages 36 characters and the English side 19 words (words such as "she"). Training set: 5.16 million; validation set: 39,000.

It can be used to train Chinese-English translation systems in either direction. Since there are millions of Chinese sentences, you can also extract just the Chinese side as a general Chinese corpus for word vectors or pre-training; the English side can be treated the same way.

## III. Model training

#### 1. Install dependencies

```
# GPU driver
sudo ubuntu-drivers autoinstall
nvidia-smi
# dependencies
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
pip install numpy
pip install transformers
pip install datasets
pip install tiktoken
pip install wandb
pip install tqdm
# PyTorch 1.13 requires turning off the switch in train.py: compile=False
pip install torch
# PyTorch 2.0 accelerates the model with torch.compile(), which only supports fairly new GPUs
# pip install --pre torch[dynamo]  --force-reinstall --extra-index-url https://download.pytorch.org/whl/nightly/cu117 --timeout 60000
```

#### 2. GPT network model definition

References for the GPT language model definition:
1) the official GPT-2 TensorFlow implementation released by OpenAI:
https://github.com/openai/gpt-2/blob/master/src/model.py
2) the huggingface/transformers PyTorch implementation:
https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
```python
"""
References for this GPT language model definition:
1) the official GPT-2 TensorFlow implementation released by OpenAI:
https://github.com/openai/gpt-2/blob/master/src/model.py
2) the huggingface/transformers PyTorch implementation:
https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
"""

import math
import inspect
from dataclasses import dataclass

import torch
import torch.nn as nn
from torch.nn import functional as F

# @torch.jit.script # enable only when not using torch.compile
def new_gelu(x):
    """
    The GELU activation function as implemented in Google BERT (identical to OpenAI GPT).
    Reference: the Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))

class LayerNorm(nn.Module):
    """ LayerNorm with an optional bias. PyTorch doesn't support simply passing bias=False """

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)

class CausalSelfAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # flash attention make GPU go brrrrr but support is only in PyTorch nightly and still a bit scary
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention') and self.dropout == 0.0
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention atm needs PyTorch nightly and dropout=0.0")
            # causal mask to ensure that attention is only applied to the left in the input sequence
            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                                        .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout, is_causal=True)
        else:
            # manual implementation of attention
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y

class MLP(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        x = self.c_fc(x)
        x = new_gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

class Block(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

@dataclass
class GPTConfig:
    block_size: int = 1024
    vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768
    dropout: float = 0.0
    bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster

class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = LayerNorm(config.n_embd, bias=config.bias),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # with weight tying when using torch.compile() some warnings get generated:
        # "UserWarning: functional_call was passed multiple values for tied weights.
        # This behavior is deprecated and will be an error in future versions"
        # not 100% sure what this is, so far seems to be harmless. TODO investigate
        self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters
        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))

    def get_num_params(self, non_embedding=True):
        """
        Return the number of parameters in the model.
        For non-embedding count (default), the position embeddings get subtracted.
        The token embeddings would too, except due to the parameter sharing these
        params are actually used as weights in the final layer, so we include them.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss

    def crop_block_size(self, block_size):
        # model surgery to decrease the block size if necessary
        # e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)
        # but want to use a smaller block size for some smaller, simpler model
        assert block_size <= self.config.block_size
        self.config.block_size = block_size
        self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
        for block in self.transformer.h:
            block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]

    @classmethod
    def from_pretrained(cls, model_type, override_args=None):
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        override_args = override_args or {} # default to empty dict
        # only dropout can be overridden see more notes below
        assert all(k == 'dropout' for k in override_args)
        from transformers import GPT2LMHeadModel
        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
        }[model_type]
        print("forcing vocab_size=50257, block_size=1024, bias=True")
        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
        config_args['bias'] = True # always True for GPT model checkpoints
        # we can override the dropout rate, if desired
        if 'dropout' in override_args:
            print(f"overriding dropout rate to {override_args['dropout']}")
            config_args['dropout'] = override_args['dropout']
        # create a from-scratch initialized minGPT model
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
f\"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}\"\n        for k in sd_keys_hf:\n            if any(k.endswith(w) for w in transposed):\n                # special treatment for the Conv1D weights we need to transpose\n                assert sd_hf&#91;k].shape&#91;::-1] == sd&#91;k].shape\n                with torch.no_grad():\n                    sd&#91;k].copy_(sd_hf&#91;k].t())\n            else:\n                # vanilla copy over the other parameters\n                assert sd_hf&#91;k].shape == sd&#91;k].shape\n                with torch.no_grad():\n                    sd&#91;k].copy_(sd_hf&#91;k])\n\n        return model\n\n    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):\n        \"\"\"\n        This long function is unfortunately doing something very simple and is being very defensive:\n        We are separating out all parameters of the model into two buckets: those that will experience\n        weight decay for regularization and those that won't (biases, and layernorm\/embedding weights).\n        We are then returning the PyTorch optimizer object.\n        \"\"\"\n\n        # separate out all parameters to those that will and won't experience regularizing weight decay\n        decay = set()\n        no_decay = set()\n        whitelist_weight_modules = (torch.nn.Linear, )\n        blacklist_weight_modules = (torch.nn.LayerNorm, LayerNorm, torch.nn.Embedding)\n        for mn, m in self.named_modules():\n            for pn, p in m.named_parameters():\n                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name\n                # random note: because named_modules and named_parameters are recursive\n                # we will see the same tensors p many many times. but doing it this way\n                # allows us to know which parent module any tensor p belongs to...\n                if pn.endswith('bias'):\n                    # all biases will not be decayed\n                    no_decay.add(fpn)\n                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):\n                    # weights of whitelist modules will be weight decayed\n                    decay.add(fpn)\n                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):\n                    # weights of blacklist modules will NOT be weight decayed\n                    no_decay.add(fpn)\n\n        # subtle: 'transformer.wte.weight' and 'lm_head.weight' are tied, so they\n        # will appear in the no_decay and decay sets respectively after the above.\n        # In addition, because named_parameters() doesn't return duplicates, it\n        # will only return the first occurence, key'd by 'transformer.wte.weight', below.\n        # so let's manually remove 'lm_head.weight' from decay set. 

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )

        # create the pytorch optimizer object
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        # new PyTorch nightly has a new 'fused' option for AdamW that is much faster
        use_fused = (device_type == 'cuda') and ('fused' in inspect.signature(torch.optim.AdamW).parameters)
        print(f"using fused AdamW: {use_fused}")
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)

        return optimizer

    def estimate_mfu(self, fwdbwd_per_iter, dt):
        """ estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """
        # first estimate the number of flops we do per iteration.
        # see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311
        N = self.get_num_params()
        cfg = self.config
        L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd//cfg.n_head, cfg.block_size
        flops_per_token = 6*N + 12*L*H*Q*T
        flops_per_fwdbwd = flops_per_token * T
        flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
        # express our flops throughput as ratio of A100 bfloat16 peak flops
        flops_achieved = flops_per_iter * (1.0/dt) # per second
        flops_promised = 312e12 # A100 GPU bfloat16 peak flops is 312 TFLOPS
        mfu = flops_achieved / flops_promised
        return mfu

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx
```

#### 3. A simple run on Shakespeare's works

If you're not a deep-learning professional and just want to try it out and feel the magic, the fastest way in is to train a character-level GPT on the works of Shakespeare. First we download them as a single (1 MB) file and turn the raw text into one long stream of integers:

```
# fetch Shakespeare's works and map characters to integers, producing the train.bin/val.bin datasets and the encoder/decoder file meta.pkl (only produced for non-GPT-2 encodings)
python data/shakespeare_char/prepare.py
```

Next we train a baby GPT model:

```
# train on shakespeare
python train.py config/train_shakespeare_char.py

# measured on a GTX 1060 GPU: after 30 minutes of training, loss drops to about 1.42
number of parameters: 10.65M
using fused AdamW: False
step 0: train loss 4.2874, val loss 4.2823
iter 0: loss 4.2675, time 124253.06ms, mfu -100.00%
iter 10: loss 3.2455, time 867.98ms, mfu 0.43%
... 30min later
saving checkpoint to out-shakespeare-char
iter 750: loss 1.4245, time 120214.49ms, mfu 0.38%
iter 760: loss 1.4523, time 876.03ms, mfu 0.38%
iter 770: loss 1.4261, time 872.84ms, mfu 0.39%
... 40min later
iter 980: loss 1.3589, time 869.70ms, mfu 0.42%
iter 990: loss 1.3402, time 878.26ms, mfu 0.42%
iter 1000: loss 1.3405, time 120470.64ms, mfu 0.38%
iter 1240: loss 1.2993, time 871.91ms, mfu 0.42%
```

If you dig into it, you'll see we're training a GPT with a context size of up to 256 characters and 384 feature channels; it's a 6-layer Transformer with 6 heads per layer. On an A100 GPU this training run takes about 3 minutes, reaching a best loss of 1.4697; on a GTX 1060, getting to the same loss takes roughly 30 minutes.
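In terms of the `GPTConfig` dataclass defined above, that baby model corresponds roughly to the following (a sketch for orientation; the real run sets these values through `config/train_shakespeare_char.py` rather than instantiating the class by hand, and the character vocabulary size comes from `meta.pkl`):

```python
# the "baby GPT" described above, expressed with the classes from model.py
config = GPTConfig(
    block_size=256,  # context of up to 256 characters
    vocab_size=65,   # size of the Shakespeare character vocabulary (from meta.pkl)
    n_layer=6,       # a 6-layer Transformer
    n_head=6,        # 6 attention heads per layer
    n_embd=384,      # 384 feature channels
    dropout=0.2,     # assumed regularization setting for the small model
)
model = GPT(config)  # reports roughly the 10.65M parameters seen in the log above
```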
We can also speed up training by adjusting the training parameters:

```
# noisier but faster loss estimates (eval_iters 200->20), context size 256->64, batch size per iteration 64->12, a smaller Transformer (4 layers, 4 heads, embedding size 128), iterations cut to 2000, and regularization simplified to dropout=0.0
python train.py config/train_shakespeare_char.py --eval_iters=20 --log_interval=1 --block_size=64 --batch_size=12 --n_layer=4 --n_head=4 --n_embd=128 --max_iters=2000 --lr_decay_iters=2000 --dropout=0.0
```

Per the config, model checkpoints are written to the `--out_dir` directory `out-shakespeare-char`. So once training finishes, we can sample from the best model by pointing the sampling script at this directory:

```
# text generation
python sample.py --out_dir=out-shakespeare-char --num_samples=2
```

This generates a few samples, for example:

```
STANLEY:
Here did thy best encountreaty days and barneth;
Nurse, here's determine, away, my father,
And I'll not speak thy heart mildness.

CLARENCE:
One, my lord, in thy sovereign the house
That else no means that would by myself is the tyranny.

HERMIONE:
My voices:
Give me to kill you so do not in the maid,
Which I will not poor my husband's body breath! I am in his shade.

RIVERS:
If it bring my own rage, being, you are advance.

EDWARD:
Night to make no much stoopy to the 
---------------

Menenio, knees you shall have hang hateful good?

TRANIO:
Well, and shall be here not uncled, whom come some stopp'd;
And my name of Marcius.

LUCIO:
Ah, sir! the most statue's courteen some still.

DUKE VIRGERY:
My brother gracious lord; I dare not good chance bear
the were but weak some in the sun of the wolf of this hearts
Of subjects in the choose of the goey severeign
Ships the shopen eyes and approach ower.

KING RICHARD II:
My lord, on sovereign; and make I took of thee,
To cold thee to e
---------------
```

Ha, after 40 minutes of training on a GPU, that's not bad for a character-level model. Fine-tuning a pretrained GPT-2 model on this dataset would very likely produce better results.

#### 4. Reproducing GPT-2

Deep-learning professionals may be more interested in reproducing GPT-2's results, so let's try that here.

###### 4.1 Training from scratch

First the dataset needs to be tokenized. Here we directly use the partially open [OpenWebText](https://huggingface.co/datasets/openwebtext) dataset (12.9 GB, the publicly available subset of what OpenAI used):

```
# download the openwebtext dataset and process it into binary files for training (train.bin/val.bin, GPT-2 BPE token IDs, uint16)
python data/openwebtext/prepare.py
```

Then we're ready to start training. To reproduce GPT-2 (124M) you need at least an 8x A100 40GB node, running:

```
# training on an ordinary single GPU
python train.py
# training on A100s (adjust the --nproc_per_node parameter to the number of A100 cards)
# torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py
```

This training process is truly long: on a GTX 1060 it would take roughly a year of training for the loss to drop to about 2.85; with 8 A100 40GB cards, expect about 4 days.

We poor folks can't be expected to wait a year or more to see results, right? Don't worry: GPT is a pre-trained model, so to speed things up we can simply load the GPT-2 pretrained weights and keep feeding in our corpus from there.

###### 4.2 Training from the GPT-2 pretrained model

Loss of the pretrained models:

![](https://blog.yanjingang.com/wp-content/uploads/2023/02/gpt2-model.png)

Initialize from the GPT-2 pretrained model and train with a smaller learning rate:

```
# fetch Shakespeare's works and generate train.bin/val.bin with the OpenAI BPE tokenizer
python data/shakespeare/prepare.py
# load the GPT-2 pretrained model and train it on the Shakespeare data
python train.py config/finetune_shakespeare.py
```
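nanoGPT config files are plain Python that overrides the defaults defined in train.py. As orientation, a fine-tuning config along the lines of `config/finetune_shakespeare.py` might look like the sketch below; the variable names follow train.py, but the values here are illustrative rather than the repo's exact settings:

```python
# illustrative fine-tuning config; the knobs mirror train.py's defaults
out_dir = 'out-shakespeare'       # where checkpoints are written (used later by sample.py)
eval_interval = 5                 # evaluate often, fine-tuning runs are short
eval_iters = 40
dataset = 'shakespeare'           # reads data/shakespeare/train.bin and val.bin
init_from = 'gpt2'                # initialize weights from the GPT-2 pretrained checkpoint
always_save_checkpoint = False    # only save when validation loss improves
batch_size = 1
gradient_accumulation_steps = 32  # simulate a larger batch on a small GPU
max_iters = 20                    # a handful of steps is enough to adapt the style
learning_rate = 3e-5              # the "smaller learning rate" mentioned above
decay_lr = False                  # keep the learning rate constant for such a short run
```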
class=\"wp-image-7169\"\/><\/a><\/figure>\n\n\n\n<p>\u4eceGPT-2\u9884\u8bad\u7ec3\u6a21\u578b\u8fdb\u884c\u521d\u59cb\u5316\u5e76\u4ee5\u8f83\u5c0f\u7684\u5b66\u4e60\u7387\u8fdb\u884c\u8bad\u7ec3\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code has-small-font-size\"><code># \u62c9\u53d6\u838e\u58eb\u6bd4\u4e9a\u4f5c\u54c1\uff0c\u4f7f\u7528OpenAI BPE\u5206\u8bcd\u5668\u751f\u6210train.bin\/val.bin\npython data\/shakespeare\/prepare.py\n# \u52a0\u8f7dGPT-2\u9884\u8bad\u7ec3\u6a21\u578b\u3001\u838e\u58eb\u6bd4\u4e9a\u6570\u636e\u8fdb\u884c\u8bad\u7ec3\npython train.py config\/finetune_shakespeare.py <\/code><\/pre>\n\n\n\n<h6 class=\"wp-block-heading\">4.3 \u901a\u8fc7GPT-2\u9884\u8bad\u7ec3\u6a21\u578b\u8fdb\u884c\u6587\u672c\u8f93\u51fa<\/h6>\n\n\n\n<pre class=\"wp-block-code has-small-font-size\"><code># \u52a0\u8f7dGPT-2\u9884\u8bad\u7ec3\u6a21\u578b\uff0c\u8fdb\u884c\u6587\u672c\u8f93\u51fa\npython sample.py  --init_from=gpt2   --start=\"\u751f\u547d\u3001\u5b87\u5b99\u548c\u4e00\u5207\u7684\u7b54\u6848\u662f\u4ec0\u4e48\uff1f\"  --num_samples=3 --max_new_tokens=1000<\/code><\/pre>\n\n\n\n<h6 class=\"wp-block-heading\">4.4 \u901a\u8fc7\u672c\u5730\u6a21\u578b\u8fdb\u884c\u6587\u672c\u8f93\u51fa<\/h6>\n\n\n\n<p>4.1\/4.2\u6b65\u9aa4\u672c\u5730\u8fed\u4ee3\u8bad\u7ec3\u751f\u6210\u7684best checkpoint model\uff08\u6700\u4f4e\u9a8c\u8bc1\u635f\u5931\uff09\u5c06\u4fdd\u5b58\u5728<code>out_dir<\/code>\u76ee\u5f55\u4e2d\uff0c\u5982\u679c\u8981\u52a0\u8f7d\u7ee7\u7eed\u8bad\u7ec3\u540e\u7684\u6a21\u578b\u8fdb\u884c\u6587\u672c\u8f93\u51fa\u7684\u8bdd\uff0c\u53ef\u4ee5\u901a\u8fc7<code>--out_dir<\/code>\u53c2\u6570\u6307\u5b9a\u4f4d\u7f6e<\/p>\n\n\n\n<pre class=\"wp-block-code has-small-font-size\"><code># \u52a0\u8f7d\u672c\u5730\u6a21\u578b\uff0c\u4f20\u5165\u5b57\u7b26\u4e32\u8fdb\u884c\u6587\u672c\u8f93\u51fa\npython sample.py  --out_dir=out-shakespeare --start=\"hi\" --num_samples=3 --max_new_tokens=100\n# \u52a0\u8f7d\u672c\u5730\u6a21\u578b\uff0c\u4f20\u5165\u6587\u4ef6\u8fdb\u884c\u6587\u672c\u8f93\u51fa\npython sample.py  --out_dir=out-shakespeare --start=\"FILE:abc.txt\" --num_samples=3 --max_new_tokens=1000<\/code><\/pre>\n\n\n\n<p>\u6d4b\u8bd5\u6548\u679c\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code has-small-font-size\"><code>An'al-liqal,\n\nDid you hear the coming of Allah's Apostle?\n\nImam of Abu Bakr,\nO 'Abdullah!\nO O `Abdullah!\nHad man said: 'O Allah!\n'O Allah!\nO O 'Abdullah!\nI am the Prophet of Allah!\nO O Allah!\n\n---------------\nhi we all went out to fight and I know there's not a soul to know more. All in the morning some of them went out at first, but some of them went on to the warhouse and some went out again. So I was on board the ship, and at night came to see how the weather was done. There was a wind, and some of the ships were on, some were on in the air, some were off, some were on the ground.\n\n---------------\nhi was not so far from it. 
###### 5.3 Chinese training

Create the training config config/train_chinese.py, setting the material directory, the model output directory, the training parameters and so on, then start training:

```
python train.py config/train_chinese.py
```
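Following the same override convention, here is a sketch of what `config/train_chinese.py` might set (all values are illustrative; size the model to your GPU memory):

```python
# illustrative training config for the Chinese dataset on a small GPU
out_dir = 'out-chinese'   # model output directory, matching sample.py --out_dir below
dataset = 'chinese'       # reads data/chinese/train.bin and val.bin
eval_interval = 250
eval_iters = 200
# a small GPT, comparable to the shakespeare_char run; scale up with more memory
n_layer = 6
n_head = 6
n_embd = 384
block_size = 256          # context length in tokens
batch_size = 12
dropout = 0.2
learning_rate = 1e-3
max_iters = 5000
lr_decay_iters = 5000
```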
target=\"_blank\" rel=\"noreferrer noopener\">\u533b\u7597NLP\u9886\u57df\uff08\u4e3b\u8981\u5173\u6ce8\u4e2d\u6587\uff09 \u8bc4\u6d4b\u6570\u636e\u96c6 \u4e0e \u8bba\u6587\u7b49\u76f8\u5173\u8d44\u6e90\u3002<\/a><\/p>\n\n\n\n<p><a href=\"https:\/\/www.sohu.com\/a\/336262203_129720\" target=\"_blank\" rel=\"noreferrer noopener\">\u5b8c\u5168\u56fe\u89e3GPT-2<\/a><\/p>\n\n\n\n<p>\u539f\u6587\u94fe\u63a5\uff1a<a href=\"https:\/\/blog.yanjingang.com\/?p=7102\">\u5982\u4f55\u8bad\u7ec3\u4e00\u4e2a\u81ea\u5df1\u7684GPT\u6a21\u578b | \u95eb\u91d1\u94a2\u7684Blog (yanjingang.com)<\/a><\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u4e00\u3001\u6982\u8ff0 1\u3001\u4ec0\u4e48\u662fGPT GPT\uff0c\u5168\u79f0\u201cGenerative Pre-training Transforme [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"site-sidebar-layout":"default","site-content-layout":"","ast-site-content-layout":"","site-content-style":"default","site-sidebar-style":"default","ast-global-header-display":"","ast-banner-title-visibility":"","ast-main-header-display":"","ast-hfb-above-header-display":"","ast-hfb-below-header-display":"","ast-hfb-mobile-header-display":"","site-post-title":"","ast-breadcrumbs-content":"","ast-featured-img":"","footer-sml-layout":"","theme-transparent-header-meta":"","adv-header-id-meta":"","stick-header-meta":"","header-above-stick-meta":"","header-main-stick-meta":"","header-below-stick-meta":"","astra-migrate-meta-layouts":"default","ast-page-background-enabled":"default","ast-page-background-meta":{"desktop":{"background-color":"var(--ast-global-color-4)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"ast-content-background-meta":{"desktop":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center 
center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"_jetpack_memberships_contains_paid_content":false,"footnotes":""},"categories":[313,289,312,43],"tags":[242,314],"class_list":["post-1727","post","type-post","status-publish","format-standard","hentry","category-chatgpt","category-gpt","category-openai","category-infoarticle","tag-chatgpt","tag-openai-api"],"views":4789,"jetpack_sharing_enabled":true,"jetpack_featured_media_url":"","_links":{"self":[{"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/posts\/1727","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=1727"}],"version-history":[{"count":1,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/posts\/1727\/revisions"}],"predecessor-version":[{"id":1728,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/posts\/1727\/revisions\/1728"}],"wp:attachment":[{"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=1727"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=1727"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=1727"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}