{"id":4770,"date":"2024-09-18T22:33:49","date_gmt":"2024-09-18T14:33:49","guid":{"rendered":"https:\/\/www.aqwu.net\/wp\/?p=4770"},"modified":"2024-09-18T22:33:49","modified_gmt":"2024-09-18T14:33:49","slug":"%e5%8f%96%e6%b6%88%e4%bb%bb%e4%bd%95llm%e6%a8%a1%e5%9e%8b%e7%9a%84%e4%b8%a5%e6%a0%bc%e5%ae%a1%e6%9f%a5","status":"publish","type":"post","link":"https:\/\/www.aqwu.net\/wp\/?p=4770","title":{"rendered":"\u53d6\u6d88\u4efb\u4f55LLM\u6a21\u578b\u7684\u4e25\u683c\u5ba1\u67e5"},"content":{"rendered":"\n<p class=\"wp-block-paragraph\">\u7b2c\u4e09\u4ee3 Llama \u6a21\u578b\u63d0\u4f9b\u4e86\u5fae\u8c03 \uff08Instruct\uff09 \u7248\u672c\uff0c\u8fd9\u4e9b\u7248\u672c\u5728\u7406\u89e3\u548c\u9075\u5faa\u8bf4\u660e\u65b9\u9762\u8868\u73b0\u51fa\u8272\u3002\u7136\u800c\uff0c\u8fd9\u4e9b\u6a21\u578b\u53d7\u5230\u4e25\u683c\u5ba1\u67e5\uff0c\u65e8\u5728\u62d2\u7edd\u88ab\u89c6\u4e3a\u6709\u5bb3\u7684\u8bf7\u6c42\uff0c\u5176\u54cd\u5e94\u5305\u62ec\u201c\u4f5c\u4e3a AI \u52a9\u624b\uff0c\u6211\u65e0\u6cd5\u5e2e\u52a9\u4f60\u201d\u3002\u867d\u7136\u6b64\u5b89\u5168\u529f\u80fd\u5bf9\u4e8e\u9632\u6b62\u8bef\u7528\u81f3\u5173\u91cd\u8981\uff0c\u4f46\u5b83\u9650\u5236\u4e86\u6a21\u578b\u7684\u7075\u6d3b\u6027\u548c\u54cd\u5e94\u80fd\u529b\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u5728\u672c\u6587\u4e2d\uff0c\u6211\u4eec\u5c06\u63a2\u8ba8\u4e00\u79cd\u79f0\u4e3a \u201cabliteration\u201d \u7684\u6280\u672f\uff0c\u8be5\u6280\u672f\u53ef\u4ee5\u5728\u4e0d\u91cd\u65b0\u8bad\u7ec3\u7684\u60c5\u51b5\u4e0b\u53d6\u6d88\u5bf9\u4efb\u4f55 LLM \u7684\u5ba1\u67e5\u3002\u8fd9\u79cd\u6280\u672f\u6709\u6548\u5730\u6d88\u9664\u4e86\u6a21\u578b\u7684\u5185\u7f6e\u62d2\u7edd\u673a\u5236\uff0c\u4f7f\u5176\u80fd\u591f\u54cd\u5e94\u6240\u6709\u7c7b\u578b\u7684\u63d0\u793a\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u8be5\u4ee3\u7801\u53ef\u5728&nbsp;<a href=\"https:\/\/colab.research.google.com\/drive\/1VYm3hOcvCpbGiqKZb141gJwjdmmCcVpR?usp=sharing\">Google Colab<\/a>&nbsp;\u548c 
GitHub \u4e0a\u7684&nbsp;<a href=\"https:\/\/github.com\/mlabonne\/llm-course\">LLM \u8bfe\u7a0b<\/a>\u4e2d\u627e\u5230\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">\u2702\ufe0f \u4ec0\u4e48\u662f\u70e7\u8680(abliteration)\uff1f<\/h2>\n\n\n\n<p class=\"wp-block-paragraph\">\u73b0\u4ee3 LLM \u9488\u5bf9\u5b89\u5168\u6027\u548c\u6307\u4ee4\u9075\u5faa\u8fdb\u884c\u4e86\u5fae\u8c03\uff0c\u8fd9\u610f\u5473\u7740\u5b83\u4eec\u7ecf\u8fc7\u57f9\u8bad\u53ef\u4ee5\u62d2\u7edd\u6709\u5bb3\u8bf7\u6c42\u3002\u5728\u4ed6\u4eec\u7684<a href=\"https:\/\/www.lesswrong.com\/posts\/jGuXSZgv6qfdhMCuJ\/refusal-in-llms-is-mediated-by-a-single-direction\">\u535a\u5ba2\u6587\u7ae0<\/a>\u4e2d\uff0cArditi \u7b49\u4eba\u8868\u660e\uff0c\u8fd9\u79cd\u62d2\u7edd\u884c\u4e3a\u662f\u7531\u6a21\u578b\u6b8b\u5dee\u6d41\u4e2d\u7684\u7279\u5b9a\u65b9\u5411\u4ecb\u5bfc\u7684\u3002\u5982\u679c\u6211\u4eec\u963b\u6b62\u6a21\u578b\u8868\u793a\u8fd9\u4e2a\u65b9\u5411\uff0c\u5b83<strong>\u5c31\u4f1a\u5931\u53bb\u62d2\u7edd\u8bf7\u6c42\u7684\u80fd\u529b<\/strong>\u3002\u76f8\u53cd\uff0c\u4eba\u4e3a\u5730\u6dfb\u52a0\u6b64\u65b9\u5411\u53ef\u80fd\u4f1a\u5bfc\u81f4\u6a21\u578b\u62d2\u7edd\u751a\u81f3\u65e0\u5bb3\u7684\u8bf7\u6c42\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u5728\u4f20\u7edf\u7684\u4ec5\u9650\u89e3\u7801\u5668\u7684\u7c7b\u4f3c Llama \u7684\u67b6\u6784\u4e2d\uff0c\u6211\u4eec\u53ef\u4ee5\u9488\u5bf9\u4e09\u4e2a\u6b8b\u5dee\u6d41\uff1a\u5728\u6bcf\u4e2a\u5757\u7684\u5f00\u5934\uff08\u201cpre\u201d\uff09\u3001\u6ce8\u610f\u529b\u5c42\u548c MLP \u5c42\u4e4b\u95f4\uff08\u201cmid\u201d\uff09\u4ee5\u53ca MLP \u4e4b\u540e\uff08\u201cpost\u201d\uff09\u3002\u4e0b\u56fe\u8bf4\u660e\u4e86\u6bcf\u4e2a\u6b8b\u5dee\u6d41\u7684\u4f4d\u7f6e\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"392\" src=\"https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-21-1024x392.png\" alt=\"\" 
class=\"wp-image-4771\" srcset=\"https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-21-1024x392.png 1024w, https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-21-300x115.png 300w, https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-21-768x294.png 768w, https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-21-1536x587.png 1536w, https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-21-1320x505.png 1320w, https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-21-600x229.png 600w, https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-21.png 2037w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\">\u8981\u53d6\u6d88 LLM \u7684\u5ba1\u67e5\uff0c\u6211\u4eec\u9996\u5148\u9700\u8981\u786e\u5b9a\u6a21\u578b\u4e2d\u7684 \u201c\u62d2\u7edd\u65b9\u5411(refusal direction)\u201d\u3002\u6b64\u8fc7\u7a0b\u6d89\u53ca\u51e0\u4e2a\u6280\u672f\u6b65\u9aa4\uff1a<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li><strong>\u6570\u636e\u6536\u96c6<\/strong>\uff1a\u5728\u4e00\u7ec4\u6709\u5bb3\u6307\u4ee4\u548c\u4e00\u7ec4\u65e0\u5bb3\u6307\u4ee4\u4e0a\u8fd0\u884c\u6a21\u578b\uff0c\u8bb0\u5f55\u6bcf\u4e2a\u6307\u4ee4\u5728\u6700\u540e\u4e00\u4e2a\u6807\u8bb0\u4f4d\u7f6e\u7684\u6b8b\u5dee\u6d41\u6fc0\u6d3b\u3002<\/li>\n\n\n\n<li><strong>\u5747\u503c\u5dee<\/strong>\u503c\uff1a\u8ba1\u7b97\u6709\u5bb3\u548c\u65e0\u5bb3\u6307\u4ee4\u7684\u6fc0\u6d3b\u503c\u4e4b\u95f4\u7684\u5747\u503c\u5dee\u503c\u3002\u8fd9\u4e3a\u6211\u4eec\u63d0\u4f9b\u4e86\u4e00\u4e2a\u5411\u91cf\uff0c\u8868\u793a\u6a21\u578b\u6bcf\u4e00\u5c42\u7684 \u201c\u62d2\u7edd\u65b9\u5411(refusal direction)\u201d\u3002<\/li>\n\n\n\n<li><strong>\u9009\u62e9<\/strong>\uff1a\u5bf9\u8fd9\u4e9b\u5411\u91cf\u8fdb\u884c\u5f52\u4e00\u5316\u5e76\u8bc4\u4f30\u5b83\u4eec\u4ee5\u9009\u62e9\u5355\u4e2a\u6700\u4f73 \u201c\u62d2\u7edd\u65b9\u5411(refusal 
direction)\u201d\u3002<\/li>\n<\/ol>\n\n\n\n<p class=\"wp-block-paragraph\">\u4e00\u65e6\u6211\u4eec\u786e\u5b9a\u4e86\u62d2\u7edd\u65b9\u5411\uff0c\u6211\u4eec\u5c31\u53ef\u4ee5 \u201c\u70e7\u8680(ablate)\u201d \u5b83\uff0c\u6709\u6548\u5730\u6d88\u9664\u4e86\u6a21\u578b\u8868\u793a\u8fd9\u4e00\u7279\u5f81\u7684\u80fd\u529b\u3002\u8fd9\u53ef\u4ee5\u901a\u8fc7<strong>\u63a8\u7406\u65f6\u5e72\u9884<\/strong>\u6216\u6c38\u4e45\u6027\u7684<strong>\u6743\u91cd\u6b63\u4ea4\u5316<\/strong>\u6765\u5b8c\u6210\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u6211\u4eec\u5148\u8c08\u8c08\u63a8\u7406\u65f6\u5e72\u9884\u3002\u5bf9\u4e8e\u5199\u5165\u6b8b\u5dee\u6d41\u7684\u6bcf\u4e2a\u7ec4\u4ef6\uff08\u4f8b\u5982\u6ce8\u610f\u529b\u5934\uff09\uff0c\u6211\u4eec\u8ba1\u7b97\u5176\u8f93\u51fa\u5728\u62d2\u7edd\u65b9\u5411\u4e0a\u7684\u6295\u5f71\u5e76\u51cf\u53bb\u6b64\u6295\u5f71\u3002\u8fd9\u79cd\u51cf\u6cd5\u5e94\u7528\u4e8e\u6bcf\u4e2a token \u548c\u6bcf\u4e00\u5c42\uff0c\u786e\u4fdd\u6a21\u578b\u6c38\u8fdc\u4e0d\u4f1a\u8868\u793a\u62d2\u7edd\u65b9\u5411\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u53e6\u4e00\u65b9\u9762\uff0c\u6743\u91cd\u6b63\u4ea4\u5316\u6d89\u53ca\u76f4\u63a5\u4fee\u6539\u6a21\u578b\u6743\u91cd\u3002\u901a\u8fc7\u5c06\u7ec4\u4ef6\u6743\u91cd\u76f8\u5bf9\u4e8e\u62d2\u7edd\u65b9\u5411\u6b63\u4ea4\u5316\uff0c\u5b83\u53ef\u4ee5\u5b8c\u5168\u9632\u6b62\u6a21\u578b\u5199\u5165\u8be5\u65b9\u5411\u3002\u8fd9\u662f\u901a\u8fc7\u8c03\u6574\u5199\u5165\u6b8b\u5dee\u6d41\u7684\u77e9\u9635\u6765\u5b9e\u73b0\u7684\uff0c\u786e\u4fdd\u5b83\u4eec\u4e0d\u4f1a\u5bf9\u62d2\u7edd\u65b9\u5411\u4ea7\u751f\u5f71\u54cd\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u5728\u4e0b\u4e00\u8282\u4e2d\uff0c\u6211\u4eec\u5c06\u4f7f\u7528\u6743\u91cd\u6b63\u4ea4\u5316\u5b9e\u73b0 abliteration\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\"><a href=\"https:\/\/huggingface.co\/blog\/mlabonne\/abliteration#%F0%9F%92%BB-implementation\"><\/a>\ud83d\udcbb \u5b9e\u73b0<\/h2>\n\n\n\n<p 
class=\"wp-block-paragraph\">\u4ee5\u4e0b\u5220\u9664\u5b9e\u73b0\u57fa\u4e8e&nbsp;<a href=\"https:\/\/huggingface.co\/failspy\/llama-3-70B-Instruct-abliterated\/blob\/main\/ortho_cookbook.ipynb\">FailSpy \u7684\u7b14\u8bb0\u672c<\/a>\uff0c\u800c\u7b14\u8bb0\u672c\u672c\u8eab\u4e5f\u57fa\u4e8e\u539f\u4f5c\u8005\u7684<a href=\"https:\/\/colab.research.google.com\/drive\/1a-aQvKC9avdZpdyBn4jgRQFObTPy1JZw?usp=sharing\">\u7b14\u8bb0\u672c<\/a>\u3002\u6211\u4e3b\u8981\u5bf9\u5176\u8fdb\u884c\u4e86\u8c03\u6574\u548c\u7b80\u5316\uff0c\u4f7f\u5176\u66f4\u6613\u4e8e\u7406\u89e3\u3002\u8fd9\u90e8\u5206\u4ee3\u7801\u5f88\u591a\uff0c\u6240\u4ee5\u4f60\u53ef\u4ee5\u770b\u5230\u53d1\u751f\u4e86\u4ec0\u4e48\uff0c\u4f46\u5982\u679c\u4f60\u5bf9\u6280\u672f\u7ec6\u8282\u4e0d\u592a\u611f\u5174\u8da3\uff0c\u4f60\u53ef\u4ee5\u4f7f\u7528 FailSpy \u7684&nbsp;<a href=\"https:\/\/github.com\/FailSpy\/abliterator\">abliterator \u5e93<\/a>\uff08\u8fd8\u53ef\u4ee5\u67e5\u770b\u4ed6\u5728 Hugging Face \u4e0a\u7684<a href=\"https:\/\/huggingface.co\/collections\/failspy\/abliterated-v3-664a8ad0db255eefa7d0012b\">\u5e9f\u9664\u6a21\u578b\u96c6\u5408<\/a>\uff09\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u8be5\u4ee3\u7801\u4f9d\u8d56\u4e8e\u51fa\u8272\u7684\u00a0<a href=\"https:\/\/github.com\/TransformerLensOrg\/TransformerLens\">TransformerLens<\/a>\u00a0\u5e93\uff08\u4ee5\u524d\u79f0\u4e3a EasyTransformer\uff09\u6765\u5b8c\u6210\u7e41\u91cd\u7684\u5de5\u4f5c\u3002\u5b83\u4e13\u4e3a\u673a\u5236\u53ef\u89e3\u91ca\u6027\u800c\u8bbe\u8ba1\uff0c\u6b64\u5904\u7528\u4e8e\u5e72\u9884\u6fc0\u6d3b\u3002\u611f\u8c22 Neel Nanda \u548c Joseph Bloom \u521b\u5efa\u548c\u7ef4\u62a4\u6b64\u5e93\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u9996\u5148\uff0c\u8ba9\u6211\u4eec\u5b89\u88c5\u5fc5\u8981\u7684\u5305\u5e76\u5bfc\u5165\u5b83\u4eec\u3002\u6240\u6709\u8fd9\u4e9b\u6b65\u9aa4\u90fd\u53ef\u4ee5\u5728\u6b64\u00a0<a 
href=\"https:\/\/colab.research.google.com\/drive\/1VYm3hOcvCpbGiqKZb141gJwjdmmCcVpR?usp=sharing\">Google Colab \u7b14\u8bb0\u672c<\/a>\u4e2d\u627e\u5230\u3002<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:python decode:true \" >!pip install transformers transformers_stream_generator tiktoken transformer_lens einops jaxtyping\n\nimport torch\nimport functools\nimport einops\nimport gc\n\nfrom datasets import load_dataset\nfrom tqdm import tqdm\nfrom torch import Tensor\nfrom typing import List\nfrom transformer_lens import HookedTransformer, utils\nfrom transformer_lens.hook_points import HookPoint\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\nfrom jaxtyping import Float, Int\nfrom collections import defaultdict\n\n# Turn automatic differentiation off to save GPU memory (credit: Undi95)\ntorch.set_grad_enabled(False)\n<\/pre><\/div>\n\n\n\n<p class=\"wp-block-paragraph\">\u6211\u4eec\u9700\u8981\u4e24\u4e2a\u6570\u636e\u96c6\uff1a\u4e00\u4e2a\u5305\u542b\u65e0\u5bb3\u7684\u6307\u4ee4\uff0c\u53e6\u4e00\u4e2a\u5305\u542b\u6709\u5bb3\u7684\u6307\u4ee4\u3002\u6211\u4eec\u5c06\u4f7f\u7528&nbsp;<a href=\"https:\/\/huggingface.co\/datasets\/tatsu-lab\/alpaca\">tatsu-lab\/alpaca<\/a>&nbsp;\u4ee5\u53ca\u6765\u81ea&nbsp;<a href=\"https:\/\/github.com\/llm-attacks\/llm-attacks\">llm-attacks<\/a>&nbsp;\u7684\u6570\u636e\u3002\u4e3a\u4e86\u65b9\u4fbf\u8d77\u89c1\uff0c\u6211\u5c06\u5b83\u4eec\u91cd\u65b0\u6253\u5305\u5230\u4e24\u4e2a Hugging Face \u6570\u636e\u96c6\u4e2d\uff1a<a href=\"https:\/\/huggingface.co\/datasets\/mlabonne\/harmless_alpaca\">mlabonne\/harmless_alpaca<\/a>&nbsp;\u548c&nbsp;<a href=\"https:\/\/huggingface.co\/datasets\/mlabonne\/harmful_behaviors\">mlabonne\/harmful_behaviors<\/a>\u3002\u8fd9\u6837\uff0c\u60a8\u53ef\u4ee5\u8f7b\u677e\u5730\u5c06\u5b83\u4eec\u66ff\u6362\u4e3a\u60a8\u81ea\u5df1\u7684\u6570\u636e\u96c6\u3002<\/p>\n\n\n\n<p 
class=\"wp-block-paragraph\">\u6211\u4eec\u5c06\u52a0\u8f7d\u6307\u4ee4\u5e76\u5c06\u5b83\u4eec\u91cd\u65b0\u683c\u5f0f\u5316\u4e3a\u5e26\u6709 \u201crole\u201d \u548c \u201ccontent\u201d \u952e\u7684\u5b57\u5178\u5217\u8868\u3002\u8fd9\u4f7f\u5f97\u5b83\u4e0e\u00a0<code>apply_chat_template()<\/code>\u00a0\u65b9\u6cd5\u517c\u5bb9\uff0c\u6211\u4eec\u5c06\u4f7f\u7528\u5b83\u6765\u9075\u5faa Llama 3 \u7684\u804a\u5929\u6a21\u677f\u3002<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:python decode:true \" >def reformat_texts(texts):\n    return [[{\"role\": \"user\", \"content\": text}] for text in texts]\n\n# Get harmful and harmless datasets\ndef get_harmful_instructions():\n    dataset = load_dataset('mlabonne\/harmful_behaviors')\n    return reformat_texts(dataset['train']['text']), reformat_texts(dataset['test']['text'])\n\ndef get_harmless_instructions():\n    dataset = load_dataset('mlabonne\/harmless_alpaca')\n    return reformat_texts(dataset['train']['text']), reformat_texts(dataset['test']['text'])\n\nharmful_inst_train, harmful_inst_test = get_harmful_instructions()\nharmless_inst_train, harmless_inst_test = get_harmless_instructions()\n<\/pre><\/div>\n\n\n\n<p class=\"wp-block-paragraph\">\u73b0\u5728\u6211\u4eec\u6709\u4e86\u6570\u636e\u96c6\uff0c\u6211\u4eec\u53ef\u4ee5\u52a0\u8f7d\u6211\u4eec\u60f3\u8981 abliterate \u7684\u6a21\u578b\u3002\u4e0d\u5e78\u7684\u662f\uff0c\u4f60\u4e0d\u80fd\u76f4\u63a5\u4f7f\u7528\u00a0<code>HookedTransformer<\/code>\u00a0\u52a0\u8f7d\u81ea\u5b9a\u4e49\u6a21\u578b\u3002\u5728\u8fd9\u91cc\uff0c\u6211\u4f7f\u7528 FailSpy \u7684\u7b14\u8bb0\u672c\u4e2d\u63cf\u8ff0\u7684\u4e00\u4e2a\u6280\u5de7\u4e0b\u8f7d\u81ea\u5b9a\u4e49\u6a21\u578b\u5e76\u5c06\u5176\u91cd\u547d\u540d\u4e3a\u00a0<a href=\"https:\/\/huggingface.co\/meta-llama\/Meta-Llama-3-8B-Instruct\">meta-llama\/Meta-Llama-3-8B-Instruct<\/a>\u3002\u5982\u679c\u60a8\u7684 GPU \u4e0e BF16 
\u4e0d\u517c\u5bb9\uff0c\u8bf7\u4ee5\u00a0<code>torch.float16<\/code>\u00a0\u683c\u5f0f\u52a0\u8f7d\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u5728\u6b64\u793a\u4f8b\u4e2d\uff0c\u6211\u4eec\u5c06\u4f7f\u7528\u00a0<a href=\"https:\/\/huggingface.co\/mlabonne\/Daredevil-8B\">mlabonne\/Daredevil-8B<\/a>\uff0c\u8fd9\u662f\u4f7f\u7528 DARE TIES \u521b\u5efa\u7684\u5927\u578b\u5408\u5e76\uff08\u8bf7\u53c2\u9605\u6211\u5173\u4e8e<a href=\"https:\/\/huggingface.co\/blog\/mlabonne\/merge-models\">\u6a21\u578b\u5408\u5e76<\/a>\u7684\u6587\u7ae0\uff09\uff0c\u5b83\u5728 8B \u7c7b\u522b\u7684 Open LLM \u6392\u884c\u699c\u4e0a\u5177\u6709\u6700\u9ad8\u7684 MMLU \u5206\u6570\u3002<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:python decode:true \" >MODEL_ID = \"mlabonne\/Daredevil-8B\"\nMODEL_TYPE = \"meta-llama\/Meta-Llama-3-8B-Instruct\"\n\n# Download and load model\n!git clone https:\/\/huggingface.co\/{MODEL_ID} {MODEL_TYPE}\n\n# Load model and tokenizer\nmodel = HookedTransformer.from_pretrained_no_processing(\n    MODEL_TYPE,\n    local_files_only=True,\n    dtype=torch.bfloat16,\n    default_padding_side='left'\n)\ntokenizer = AutoTokenizer.from_pretrained(MODEL_TYPE)\ntokenizer.padding_side = 'left'\ntokenizer.pad_token = tokenizer.eos_token\n<\/pre><\/div>\n\n\n\n<p class=\"wp-block-paragraph\">\u6211\u4eec\u73b0\u5728\u53ef\u4ee5\u5bf9\u6570\u636e\u96c6\u8fdb\u884c\u6807\u8bb0\u5316\u3002\u6211\u4eec\u5bf9\u65e0\u5bb3\u548c\u6709\u5bb3\u6307\u4ee4\u4f7f\u7528\u76f8\u540c\u6570\u91cf\u7684\u6837\u672c\u3002\u8bf7\u6ce8\u610f\uff0c\u5927\u91cf\u6837\u672c\u53ef\u4ee5\u4f7f\u7528\u6240\u6709 RAM\/VRAM\uff0c\u8fd9\u5c31\u662f\u4e3a\u4ec0\u4e48\u6211\u5728\u8fd9\u91cc\u5c06\u5176\u9650\u5236\u4e3a 256 \u7684\u539f\u56e0\u3002<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:python decode:true \" >def tokenize_instructions(tokenizer, instructions):\n    return 
tokenizer.apply_chat_template(\n        instructions,\n        padding=True,\n        truncation=False,\n        return_tensors=\"pt\",\n        return_dict=True,\n        add_generation_prompt=True,\n    ).input_ids\n\nn_inst_train = min(256, len(harmful_inst_train), len(harmless_inst_train))\n\n# Tokenize datasets\nharmful_tokens = tokenize_instructions(\n    tokenizer,\n    instructions=harmful_inst_train[:n_inst_train],\n)\nharmless_tokens = tokenize_instructions(\n    tokenizer,\n    instructions=harmless_inst_train[:n_inst_train],\n)\n<\/pre><\/div>\n\n\n\n<p class=\"wp-block-paragraph\">\u4e00\u5207\u90fd\u8bbe\u7f6e\u597d\u4e86\uff0c\u6211\u4eec\u73b0\u5728\u53ef\u4ee5\u5b9e\u65bd abliteration \u7684\u7b2c\u4e00\u6b65\uff1a\u6570\u636e\u6536\u96c6\u3002\u6211\u4eec\u5e0c\u671b\u5904\u7406\u8fd9\u4e9b\u6807\u8bb0\u5316\u7684\u6570\u636e\u96c6\uff0c\u5e76\u5c06\u6b8b\u5dee\u6d41\u6fc0\u6d3b\u5b58\u50a8\u5728\u00a0<code>Harmful<\/code>\u00a0\u548c\u00a0<code>Harmless<\/code>\u00a0\u4e2d\u3002\u8fd9\u7531\u00a0<a href=\"https:\/\/github.com\/TransformerLensOrg\/TransformerLens\">transformer_lens<\/a>\u00a0\u5e93\u7ba1\u7406\u3002<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:python decode:true \" ># Define batch size based on available VRAM\nbatch_size = 32\n\n# Initialize defaultdicts to store activations\nharmful = defaultdict(list)\nharmless = defaultdict(list)\n\n# Process the training data in batches\nnum_batches = (n_inst_train + batch_size - 1) \/\/ batch_size\nfor i in tqdm(range(num_batches)):\n    print(i)\n    start_idx = i * batch_size\n    end_idx = min(n_inst_train, start_idx + batch_size)\n\n    # Run models on harmful and harmless prompts, cache activations\n    harmful_logits, harmful_cache = model.run_with_cache(\n        harmful_tokens[start_idx:end_idx],\n        names_filter=lambda hook_name: 'resid' in hook_name,\n        device='cpu',\n        reset_hooks_end=True\n    )\n    harmless_logits, 
harmless_cache = model.run_with_cache(\n        harmless_tokens[start_idx:end_idx],\n        names_filter=lambda hook_name: 'resid' in hook_name,\n        device='cpu',\n        reset_hooks_end=True\n    )\n\n    # Collect and store the activations\n    for key in harmful_cache:\n        harmful[key].append(harmful_cache[key])\n        harmless[key].append(harmless_cache[key])\n\n    # Flush RAM and VRAM\n    del harmful_logits, harmless_logits, harmful_cache, harmless_cache\n    gc.collect()\n    torch.cuda.empty_cache()\n\n# Concatenate the cached activations\nharmful = {k: torch.cat(v) for k, v in harmful.items()}\nharmless = {k: torch.cat(v) for k, v in harmless.items()}\n<\/pre><\/div>\n\n\n\n<p class=\"wp-block-paragraph\">\u73b0\u5728\uff0c\u6211\u4eec\u53ef\u4ee5\u8ba1\u7b97\u6bcf\u4e00\u5c42\u7684\u62d2\u7edd\u65b9\u5411\u3002\u8fd9\u5bf9\u5e94\u4e8e\u6709\u5bb3\u548c\u65e0\u5bb3\u6307\u4ee4\u7684\u6fc0\u6d3b\u4e4b\u95f4\u7684\u5e73\u5747\u5dee\uff0c\u7136\u540e\u5c06\u5176\u5f52\u4e00\u5316\u3002\u6211\u4eec\u5c06\u5b83\u4eec\u6309\u964d\u5e8f\u6392\u5e8f\uff0c\u5e76\u5b58\u5165 <code>activation_scored<\/code>\u3002<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:python decode:true \" ># Helper function to get activation index\ndef get_act_idx(cache_dict, act_name, layer):\n    key = (act_name, layer)\n    return cache_dict[utils.get_act_name(*key)]\n\n# Compute difference of means between harmful and harmless activations at intermediate layers\nactivation_layers = [\"resid_pre\", \"resid_mid\", \"resid_post\"]\nactivation_refusals = defaultdict(list)\n\nfor layer_num in range(1, model.cfg.n_layers):\n    pos = -1  # Position index\n\n    for layer in activation_layers:\n        harmful_mean_act = get_act_idx(harmful, layer, layer_num)[:, pos, :].mean(dim=0)\n        harmless_mean_act = get_act_idx(harmless, layer, layer_num)[:, pos, :].mean(\n            dim=0\n        )\n\n        refusal_dir = harmful_mean_act - 
harmless_mean_act\n        refusal_dir = refusal_dir \/ refusal_dir.norm()\n        activation_refusals[layer].append(refusal_dir)\n\n# Get all calculated potential refusal directions, sort them in descending order based on their mean\n# Use a subset of layers if certain activations are not promising\nselected_layers = [\"resid_pre\"]\nactivation_scored = sorted(\n    [\n        activation_refusals[layer][l - 1]\n        for l in range(1, model.cfg.n_layers)\n        for layer in selected_layers\n    ],\n    key=lambda x: abs(x.mean()),\n    reverse=True,\n)\n<\/pre><\/div>\n\n\n\n<p class=\"wp-block-paragraph\">\u8be5\u8fc7\u7a0b\u7684\u6700\u540e\u4e00\u6b65\u5305\u62ec\u8bc4\u4f30\u6211\u4eec\u8ba1\u7b97\u7684\u62d2\u7edd\u65b9\u5411\u3002\u4e3a\u6b64\uff0c\u6211\u4eec\u5c06\u5728\u63a8\u7406\u8fc7\u7a0b\u4e2d\u5c06\u62d2\u7edd\u65b9\u5411\u5e94\u7528\u4e8e\u6bcf\u4e2a\u6b8b\u5dee\u6d41\u548c\u6bcf\u4e2a\u5757\u3002\u5728\u4e0b\u9762\u7684\u4ee3\u7801\u6bb5\u4e2d\uff0c\u6211\u4eec\u9488\u5bf9 4 \u6761\u6709\u5bb3\u6d4b\u8bd5\u6307\u4ee4\u548c 20 \u4e2a\u5757\uff08\u6216\u5c42\uff09\u83b7\u53d6\u751f\u6210\u7ed3\u679c\u3002<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:python decode:true \" >def _generate_with_hooks(\n    model: HookedTransformer,\n    tokenizer: AutoTokenizer,\n    tokens: Int[Tensor, \"batch_size seq_len\"],\n    max_tokens_generated: int = 64,\n    fwd_hooks=[],\n) -&gt; List[str]:\n    all_tokens = torch.zeros(\n        (tokens.shape[0], tokens.shape[1] + max_tokens_generated),\n        dtype=torch.long,\n        device=tokens.device,\n    )\n    all_tokens[:, : tokens.shape[1]] = tokens\n    for i in range(max_tokens_generated):\n        with model.hooks(fwd_hooks=fwd_hooks):\n            logits = model(all_tokens[:, : -max_tokens_generated + i])\n            next_tokens = logits[:, -1, :].argmax(\n                dim=-1\n            )  # greedy sampling (temperature=0)\n            all_tokens[:, -max_tokens_generated + 
i] = next_tokens\n    return tokenizer.batch_decode(\n        all_tokens[:, tokens.shape[1] :], skip_special_tokens=True\n    )\n\ndef get_generations(\n    model: HookedTransformer,\n    tokenizer: AutoTokenizer,\n    instructions: List[str],\n    fwd_hooks=[],\n    max_tokens_generated: int = 64,\n    batch_size: int = 4,\n) -&gt; List[str]:\n    generations = []\n    for i in tqdm(range(0, len(instructions), batch_size)):\n        tokens = tokenize_instructions(\n            tokenizer, instructions=instructions[i : i + batch_size]\n        )\n        generation = _generate_with_hooks(\n            model,\n            tokenizer,\n            tokens,\n            max_tokens_generated=max_tokens_generated,\n            fwd_hooks=fwd_hooks,\n        )\n        generations.extend(generation)\n    return generations\n\n# Inference-time intervention hook\ndef direction_ablation_hook(\n    activation: Float[Tensor, \"... d_act\"],\n    hook: HookPoint,\n    direction: Float[Tensor, \"d_act\"],\n):\n    if activation.device != direction.device:\n        direction = direction.to(activation.device)\n    proj = (\n        einops.einsum(\n            activation, direction.view(-1, 1), \"... d_act, d_act single -&gt; ... 
single\"\n        )\n        * direction\n    )\n    return activation - proj\n\n# Testing baseline\nN_INST_TEST = 4\nbaseline_generations = get_generations(\n    model, tokenizer, harmful_inst_test[:N_INST_TEST], fwd_hooks=[]\n)\n\n# Evaluating layers defined earlier (needs human evaluation to determine best layer for refusal inhibition)\nEVAL_N = 20  # Evaluate how many of the top N potential directions\nevals = []\nfor refusal_dir in tqdm(activation_scored[:EVAL_N]):\n    hook_fn = functools.partial(direction_ablation_hook, direction=refusal_dir)\n    fwd_hooks = [\n        (utils.get_act_name(act_name, layer), hook_fn)\n        for layer in list(range(model.cfg.n_layers))\n        for act_name in activation_layers\n    ]\n    intervention_generations = get_generations(\n        model, tokenizer, harmful_inst_test[:N_INST_TEST], fwd_hooks=fwd_hooks\n    )\n    evals.append(intervention_generations)\n<\/pre><\/div>\n\n\n\n<p class=\"wp-block-paragraph\">\u6211\u4eec\u5c06\u6240\u6709\u751f\u6210\u7ed3\u679c\u90fd\u5b58\u50a8\u5728\u00a0<code>evals<\/code>\u00a0\u5217\u8868\u4e2d\u3002\u6211\u4eec\u73b0\u5728\u53ef\u4ee5\u6253\u5370\u5b83\u4eec\u5e76\u624b\u52a8\u9009\u62e9\u4e3a\u6bcf\u4e2a\u6307\u4ee4\u63d0\u4f9b\u54cd\u5e94\u7684\u5c42 \uff08\u5757\uff09\u3002\u6211\u4f1a\u81ea\u52a8\u6392\u9664\u5305\u542b\u201cI cannot\u201d\u548c\u201cI can&#8217;t\u201d\u7684\u56de\u590d\uff0c\u4ee5\u7b5b\u9009\u6389\u4e0d\u9700\u8981\u7684\u7b54\u6848\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u5982\u679c\u627e\u4e0d\u5230\u6ee1\u8db3\u8fd9\u4e9b\u8981\u6c42\u7684\u5c42\uff0c\u5219\u53ef\u80fd\u9700\u8981\u6d4b\u8bd5\u524d\u9762\u7684\u00a0<code>selected_layers<\/code>\u00a0\u5217\u8868\u4e2d\u7684\u5176\u4ed6\u6b8b\u5dee\u6d41\u3001\u5176\u4ed6\u6307\u4ee4\u3001\u5176\u4ed6\u5757\u7b49\u3002<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:python decode:true \" ># Print generations for human evaluation\nblacklist = [\"I cannot\", \"I 
can't\"]\nfor i in range(N_INST_TEST):\n    print(f\"\\033[1mINSTRUCTION {i}: {harmful_inst_test[i]}\")\n    print(f\"\\nBASELINE COMPLETION:\\n{baseline_generations[i]}\\033[0m\")\n    for layer_candidate in range(EVAL_N):\n        if not any(word in evals[layer_candidate][i] for word in blacklist):\n            print(f\"\\n---\\n\\nLAYER CANDIDATE #{layer_candidate} INTERVENTION COMPLETION:\")\n            print(evals[layer_candidate][i])\n<\/pre><\/div>\n\n\n\n<p class=\"wp-block-paragraph\">\u5c31\u6211\u800c\u8a00\uff0c\u5019\u9009\u5c42 9 \u8bbe\u6cd5\u4e3a\u8fd9\u56db\u6761\u6307\u4ee4\u63d0\u4f9b\u4e86\u672a\u7ecf\u5ba1\u67e5\u7684\u7b54\u6848\u3002\u8fd9\u662f\u6211\u4eec\u5c06\u4e3a\u62d2\u7edd\u65b9\u5411\u9009\u62e9\u7684\u90a3\u4e2a\u3002\u5728\u4e0b\u6587\u4e2d\uff0c\u6211\u4eec\u5c06\u5b9e\u65bd\u6743\u91cd\u6b63\u4ea4\u5316\u6765\u4fee\u6539\u6743\u91cd\u5e76\u9632\u6b62\u6a21\u578b\u521b\u5efa\u5177\u6709\u6b64\u65b9\u5411\u7684\u8f93\u51fa\u3002\u60a8\u53ef\u4ee5\u901a\u8fc7\u6253\u5370\u751f\u6210\u7ed3\u679c\u6765\u9a8c\u8bc1\u6a21\u578b\u662f\u5426\u5df2\u6210\u529f\u53d6\u6d88\u5ba1\u67e5\u3002<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:python decode:true \" >def get_orthogonalized_matrix(\n    matrix: Float[Tensor, \"... d_model\"], vec: Float[Tensor, \"d_model\"]\n) -&gt; Float[Tensor, \"... d_model\"]:\n    proj = (\n        einops.einsum(\n            matrix, vec.view(-1, 1), \"... d_model, d_model single -&gt; ... 
single\"\n        )\n        * vec\n    )\n    return matrix - proj\n\n# Select the layer with the highest potential refusal direction\nLAYER_CANDIDATE = 9\nrefusal_dir = activation_scored[LAYER_CANDIDATE]\n\n# Orthogonalize the model's weights\nif refusal_dir.device != model.W_E.device:\n    refusal_dir = refusal_dir.to(model.W_E.device)\nmodel.W_E.data = get_orthogonalized_matrix(model.W_E, refusal_dir)\n\nfor block in tqdm(model.blocks):\n    if refusal_dir.device != block.attn.W_O.device:\n        refusal_dir = refusal_dir.to(block.attn.W_O.device)\n    block.attn.W_O.data = get_orthogonalized_matrix(block.attn.W_O, refusal_dir)\n    block.mlp.W_out.data = get_orthogonalized_matrix(block.mlp.W_out, refusal_dir)\n\n# Generate text with abliterated model\northogonalized_generations = get_generations(\n    model, tokenizer, harmful_inst_test[:N_INST_TEST], fwd_hooks=[]\n)\n\n# Print generations\nfor i in range(N_INST_TEST):\n    if len(baseline_generations) &gt; i:\n        print(f\"INSTRUCTION {i}: {harmful_inst_test[i]}\")\n        print(f\"\\033[92mBASELINE COMPLETION:\\n{baseline_generations[i]}\")\n    print(f\"\\033[91mINTERVENTION COMPLETION:\\n{evals[LAYER_CANDIDATE][i]}\")\n    print(f\"\\033[95mORTHOGONALIZED COMPLETION:\\n{orthogonalized_generations[i]}\\n\")\n<\/pre><\/div>\n\n\n\n<p class=\"wp-block-paragraph\">\u73b0\u5728\uff0c\u6211\u4eec\u5df2\u51c6\u5907\u597d\u4f7f\u7528\u8be5\u6a21\u578b\u3002\u6211\u4eec\u5c06\u5176\u8f6c\u6362\u56de Hugging Face \u683c\u5f0f\u5e76\u5c06\u5176\u4e0a\u4f20\u5230 HF \u4e2d\u5fc3\u3002<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:python decode:true \" ># Convert model back to HF safetensors\nhf_model = AutoModelForCausalLM.from_pretrained(MODEL_TYPE, torch_dtype=torch.bfloat16)\nlm_model = hf_model.model\n\nstate_dict = model.state_dict()\nlm_model.embed_tokens.weight = torch.nn.Parameter(state_dict[\"embed.W_E\"].cpu())\n\nfor l in range(model.cfg.n_layers):\n    
lm_model.layers[l].self_attn.o_proj.weight = torch.nn.Parameter(\n        einops.rearrange(\n            state_dict[f\"blocks.{l}.attn.W_O\"], \"n h m-&gt;m (n h)\", n=model.cfg.n_heads\n        ).contiguous()\n    )\n    lm_model.layers[l].mlp.down_proj.weight = torch.nn.Parameter(\n        torch.transpose(state_dict[f\"blocks.{l}.mlp.W_out\"], 0, 1).contiguous()\n    )\n\nhf_model.push_to_hub(f\"{MODEL_ID}-abliterated\")\n# hf_model.push_to_hub(f\"{MODEL_ID}-abliterated\")\n<\/pre><\/div>\n\n\n\n<h2 class=\"wp-block-heading\"><a href=\"https:\/\/huggingface.co\/blog\/mlabonne\/abliteration#%E2%9A%96%EF%B8%8F-dpo-fine-tuning\"><\/a>\u2696\ufe0f DPO \u5fae\u8c03<\/h2>\n\n\n\n<p class=\"wp-block-paragraph\">\u6211\u5728 Open LLM \u6392\u884c\u699c\u548c Nous \u7684\u57fa\u51c6\u6d4b\u8bd5\u5957\u4ef6\u4e0a\u8bc4\u4f30\u4e86\u4e0a\u4e00\u8282\u4e2d\u7684\u5220\u51cf\u6a21\u578b\u548c\u6e90\u6a21\u578b\u3002\u7ed3\u679c\u5982\u4e0b\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"166\" src=\"https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-22-1024x166.png\" alt=\"\" class=\"wp-image-4772\" srcset=\"https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-22-1024x166.png 1024w, https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-22-300x49.png 300w, https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-22-768x125.png 768w, https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-22-600x97.png 600w, https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-22.png 1238w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\">\u5982\u60a8\u6240\u89c1\uff0c\u6e90\u6a21\u578b\u7684\u6027\u80fd\u660e\u663e\u4f18\u4e8e Llama 3 8B 
Instruct\u3002\u4f46\u662f\uff0c\u6211\u4eec\u89c2\u5bdf\u5230\u5728\u6240\u6709\u57fa\u51c6\u6d4b\u8bd5\u4e2d\uff0c\u6d88\u878d\u7248\u672c\u7684\u6027\u80fd\u90fd\u6709\u6240\u4e0b\u964d\u3002\u6d88\u878d\u8fc7\u7a0b\u6210\u529f\u5730\u53d6\u6d88\u4e86\u5b83\u7684\u5ba1\u67e5\uff0c\u4f46\u4e5f\u964d\u4f4e\u4e86\u6a21\u578b\u7684\u8d28\u91cf\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u4e3a\u4e86\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898\uff0c\u4e00\u4e2a\u60f3\u6cd5\u662f\u8fdb\u4e00\u6b65\u8bad\u7ec3\u6211\u4eec\u7684 abliterated \u6a21\u578b\u6765\u4fee\u590d\u5b83\u3002\u4e0e\u5927\u591a\u6570\u5fae\u8c03\u6a21\u578b\u4e00\u6837\uff0cLlama 3 8B Instruct \u5728\u76d1\u7763\u5fae\u8c03\u65b9\u9762\u975e\u5e38\u8106\u5f31\u3002\u989d\u5916\u7684 SFT \u53ef\u80fd\u4f1a\u7834\u574f\u6a21\u578b\u7684\u6027\u80fd\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u6216\u8005\uff0c\u504f\u597d\u5bf9\u9f50\u975e\u5e38\u8f7b\u91cf\uff0c\u4e0d\u5e94\u8be5\u4f1a\u7834\u574f\u6211\u4eec\u7684 abliterated \u6a21\u578b\u3002DPO \u56e0\u5176\u6613\u7528\u6027\u548c\u826f\u597d\u7684\u8ddf\u8e2a\u8bb0\u5f55\u800c\u6210\u4e3a\u4e0d\u9519\u7684\u9009\u62e9\u3002\u4e3a\u4e86\u5b9e\u73b0\u5b83\uff0c\u6211\u5c06\u00a0<a href=\"https:\/\/colab.research.google.com\/drive\/1TsDKNo2riwVmU55gjuBgB1AXVtRRfRHW?usp=sharing\">LazyAxolotl<\/a>\u00a0\u4e0e\u00a0<a href=\"https:\/\/huggingface.co\/datasets\/mlabonne\/orpo-dpo-mix-40k\">mlabonne\/orpo-dpo-mix-40k<\/a>\u00a0\u6570\u636e\u96c6\u4e00\u8d77\u4f7f\u7528\u3002\u4ee5\u4e0b\u662f\u6211\u4f7f\u7528\u7684\u914d\u7f6e\uff1a<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:python decode:true \" >base_model: mlabonne\/Daredevil-8B-abliterated\nmodel_type: LlamaForCausalLM\ntokenizer_type: AutoTokenizer\n\nload_in_8bit: false\nload_in_4bit: true\nstrict: false\nsave_safetensors: true\n\nrl: dpo\nchat_template: chatml\ndatasets:\n  - path: mlabonne\/orpo-dpo-mix-40k-flat\n    split: train\n    type: 
chatml.intel\n\ndataset_prepared_path:\nval_set_size: 0.0\noutput_dir: .\/out\n\nadapter: qlora\nlora_model_dir:\n\nsequence_len: 2048\nsample_packing: false\npad_to_sequence_len: false\n\nlora_r: 64\nlora_alpha: 32\nlora_dropout: 0.05\nlora_target_linear: true\nlora_fan_in_fan_out:\n\nwandb_project: axolotl\nwandb_entity:\nwandb_watch:\nwandb_name:\nwandb_log_model:\n\ngradient_accumulation_steps: 8\nmicro_batch_size: 1\nnum_epochs: 1\noptimizer: paged_adamw_8bit\nlr_scheduler: cosine\nlearning_rate: 5e-6\ntrain_on_inputs: false\ngroup_by_length: false\n\nbf16: auto\nfp16:\ntf32:\n\ngradient_checkpointing: true\nearly_stopping_patience:\nresume_from_checkpoint:\nlocal_rank:\nlogging_steps: 1\nxformers_attention:\nflash_attention: true\nwarmup_steps: 100\nevals_per_epoch: 0\neval_table_size:\neval_table_max_new_tokens: 128\nsaves_per_epoch: 1\ndebug:\ndeepspeed: deepspeed_configs\/zero2.json\nweight_decay: 0.0\nspecial_tokens:\n  pad_token: &lt;|end_of_text|&gt;\n<\/pre><\/div>\n\n\n\n<p class=\"wp-block-paragraph\">\u6211\u4f7f\u7528 6xA6000 GPU \u548c DeepSpeed ZeRO-2 \u5bf9\u5176\u8fdb\u884c\u4e86\u8bad\u7ec3\u3002\u8bad\u7ec3\u5927\u7ea6\u9700\u8981 6 \u5c0f\u65f6 45 \u5206\u949f\u3002\u4ee5\u4e0b\u662f\u6211\u4ece W&amp;B \u5f97\u5230\u7684\u8bad\u7ec3\u66f2\u7ebf\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"599\" src=\"https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-23-1024x599.png\" alt=\"\" class=\"wp-image-4773\" srcset=\"https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-23-1024x599.png 1024w, https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-23-300x176.png 300w, https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-23-768x450.png 768w, https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-23-1536x899.png 1536w, 
https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-23-1320x773.png 1320w, https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-23-600x351.png 600w, https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-23.png 1864w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\">\u5b83\u81ea\u52a8\u4e0a\u4f20\u4e86 DPO \u5fae\u8c03\u6a21\u578b\uff0c\u79f0\u4e3a&nbsp;<a href=\"https:\/\/huggingface.co\/mlabonne\/NeuralDaredevil-8B-abliterated\">mlabonne\/NeuralDaredevil-8B-abliterated<\/a>\u3002\u4e3a\u4e86\u770b\u770b\u5b83\u662f\u5426\u4fee\u590d\u4e86\u6211\u4eec\u7684 abliterated \u7248\u672c\uff0c\u6211\u6839\u636e\u76f8\u540c\u7684\u57fa\u51c6\u5bf9\u5176\u8fdb\u884c\u4e86\u8bc4\u4f30\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"172\" src=\"https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-24-1024x172.png\" alt=\"\" class=\"wp-image-4774\" srcset=\"https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-24-1024x172.png 1024w, https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-24-300x50.png 300w, https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-24-768x129.png 768w, https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-24-600x101.png 600w, https:\/\/www.aqwu.net\/wp\/wp-content\/uploads\/2024\/09\/\u56fe\u7247-24.png 1238w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\">\u6211\u4eec\u53ef\u4ee5\u770b\u5230\uff0c\u8fd9\u79cd\u989d\u5916\u7684\u8bad\u7ec3\u4f7f\u6211\u4eec\u80fd\u591f\u6062\u590d\u7531\u4e8e\u70e7\u8680\u800c\u5bfc\u81f4\u7684\u5927\u90e8\u5206\u6027\u80fd\u4e0b\u964d\u3002\u8be5\u6a21\u578b\u6ca1\u6709\u6539\u8fdb\u7684\u4e00\u4e2a\u9886\u57df\u662f 
GSM8K\uff0c\u4e00\u4e2a\u6570\u5b66\u6570\u636e\u96c6\uff0c\u8fd9\u53ef\u80fd\u610f\u5473\u7740 orpo-dpo-mix-40k \u5c06\u4ece\u66f4\u591a\u7684\u6570\u5b66\u6837\u672c\u4e2d\u53d7\u76ca\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u6700\u7ec8\u6a21\u578b\u662f\u672a\u7ecf\u5ba1\u67e5\u7684 LLM\uff0c\u5728 8B \u7c7b\u522b\u4e2d\u5177\u6709\u6700\u5148\u8fdb\u7684\u6027\u80fd\u3002\u5f53\u60a8\u4e0d\u9700\u8981\u5ba1\u67e5\u65f6\uff0c\u6211\u63a8\u8350\u5b83\u4f5c\u4e3a Llama 3 8B Instruct \u7684\u6539\u8fdb\u7248\u672c\u3002\u60a8\u53ef\u4ee5\u5728 LM Studio \u4e2d\u4f7f\u7528 GGUF \u7b49\u91cf\u5316\u7248\u672c\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\"><a href=\"https:\/\/huggingface.co\/blog\/mlabonne\/abliteration#conclusion\"><\/a>\u7ed3\u8bba<\/h2>\n\n\n\n<p class=\"wp-block-paragraph\">\u5728\u672c\u6587\u4e2d\uff0c\u6211\u4eec\u4ecb\u7ecd\u4e86\u70e7\u8680\u7684\u6982\u5ff5\u3002\u8be5\u6280\u672f\u4f7f\u7528\u6a21\u578b\u5bf9\u65e0\u5bb3\u548c\u6709\u5bb3\u63d0\u793a\u7684\u6fc0\u6d3b\u6765\u8ba1\u7b97\u62d2\u7edd\u65b9\u5411\u3002\u7136\u540e\uff0c\u5b83\u4f7f\u7528\u8fd9\u4e2a\u65b9\u5411\u6765\u4fee\u6539\u6a21\u578b\u7684\u6743\u91cd\uff0c\u5e76\u786e\u4fdd\u6211\u4eec\u505c\u6b62\u8f93\u51fa\u62d2\u7edd\uff08refusals\uff09\u3002\u8fd9\u79cd\u6280\u672f\u8fd8\u8bc1\u660e\u4e86\u5b89\u5168\u5fae\u8c03\u7684\u8106\u5f31\u6027\uff0c\u5e76\u63d0\u51fa\u4e86\u9053\u5fb7\u8003\u8651\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u6211\u4eec\u5bf9 Daredevil-8B \u5e94\u7528\u4e86\u70e7\u8680\u4ee5\u53d6\u6d88\u5ba1\u67e5\uff0c\u8fd9\u4e5f\u964d\u4f4e\u4e86\u6a21\u578b\u7684\u6027\u80fd\u3002\u7136\u540e\uff0c\u6211\u4eec\u4f7f\u7528 DPO \u5bf9\u5176\u8fdb\u884c\u4fee\u590d\uff0c\u4ee5\u521b\u5efa NeuralDaredevil-8B \u6a21\u578b\uff0c\u8fd9\u662f\u4e00\u79cd\u5b8c\u5168\u672a\u7ecf\u5ba1\u67e5\u7684\u9ad8\u8d28\u91cf 8B 
LLM\u3002\u64e6\u9664\u4e0d\u4ec5\u9650\u4e8e\u5220\u9664\u5bf9\u9f50\uff0c\u5e94\u88ab\u89c6\u4e3a\u4e00\u79cd\u65e0\u9700\u91cd\u65b0\u8bad\u7ec3\u7684\u5fae\u8c03\u5f62\u5f0f\u3002\u4e8b\u5b9e\u4e0a\uff0c\u5b83\u53ef\u4ee5\u521b\u9020\u6027\u5730\u5e94\u7528\u4e8e\u5176\u4ed6\u76ee\u6807\uff0c\u4f8b\u5982 FailSpy \u7684&nbsp;<a href=\"https:\/\/huggingface.co\/failspy\/Llama-3-8B-Instruct-MopeyMule\">MopeyMule<\/a>\uff0c\u5b83\u91c7\u7528\u5fe7\u90c1\u7684\u5bf9\u8bdd\u98ce\u683c\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u6211\u5e0c\u671b\u4f60\u559c\u6b22\u8fd9\u7bc7\u6587\u7ae0\u3002\u5982\u679c\u4f60\u60f3\u770b\u5230\u66f4\u591a\uff0c\u8bf7\u5728&nbsp;<a href=\"https:\/\/huggingface.co\/mlabonne\/\">Hugging Face<\/a>&nbsp;\u548c Twitter&nbsp;<a href=\"https:\/\/twitter.com\/maximelabonne\">@maximelabonne<\/a>&nbsp;\u4e0a\u5173\u6ce8\u6211\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\"><a href=\"https:\/\/huggingface.co\/blog\/mlabonne\/abliteration#references\"><\/a>\u5f15\u7528<\/h2>\n\n\n\n<ul class=\"wp-block-list\">\n<li>FailSpy\uff0c\u201c<a href=\"https:\/\/github.com\/FailSpy\/abliterator\">abliterator \u5e93<\/a>\u201d\uff0cGitHub\uff0c2024 \u5e74\u3002<\/li>\n\n\n\n<li>Andy Arditi\u3001Oscar Obeso\u3001Aaquib111\u3001wesg\u3001Neel Nanda\uff0c\u201c<a href=\"https:\/\/www.lesswrong.com\/posts\/jGuXSZgv6qfdhMCuJ\/refusal-in-llms-is-mediated-by-a-single-direction\">LLM \u4e2d\u7684\u62d2\u7edd\u7531\u5355\u4e00\u65b9\u5411\u8c03\u89e3<\/a>\u201d\uff0cLesswrong\uff0c2024 \u5e74\u3002<\/li>\n<\/ul>\n\n\n\n<p class=\"wp-block-paragraph\">\u539f\u6587\u94fe\u63a5\uff1a<a href=\"https:\/\/huggingface.co\/blog\/mlabonne\/abliteration\">https:\/\/huggingface.co\/blog\/mlabonne\/abliteration<\/a><\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u7b2c\u4e09\u4ee3 Llama \u6a21\u578b\u63d0\u4f9b\u4e86\u5fae\u8c03 \uff08Instruct\uff09 
\u7248\u672c\uff0c\u8fd9\u4e9b\u7248\u672c\u5728\u7406\u89e3\u548c\u9075\u5faa\u8bf4\u660e\u65b9\u9762\u8868\u73b0\u51fa\u8272\u3002\u7136\u800c\uff0c\u8fd9 [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"site-sidebar-layout":"default","site-content-layout":"","ast-site-content-layout":"default","site-content-style":"default","site-sidebar-style":"default","ast-global-header-display":"","ast-banner-title-visibility":"","ast-main-header-display":"","ast-hfb-above-header-display":"","ast-hfb-below-header-display":"","ast-hfb-mobile-header-display":"","site-post-title":"","ast-breadcrumbs-content":"","ast-featured-img":"","footer-sml-layout":"","theme-transparent-header-meta":"","adv-header-id-meta":"","stick-header-meta":"","header-above-stick-meta":"","header-main-stick-meta":"","header-below-stick-meta":"","astra-migrate-meta-layouts":"set","ast-page-background-enabled":"default","ast-page-background-meta":{"desktop":{"background-color":"var(--ast-global-color-4)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center 
center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"ast-content-background-meta":{"desktop":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center 
center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"_jetpack_memberships_contains_paid_content":false,"footnotes":""},"categories":[444,445,443,442],"tags":[557,404],"class_list":["post-4770","post","type-post","status-publish","format-standard","hentry","category-ai","category-ainews","category-llm","category-llms","tag-abliteration","tag-llm"],"views":6561,"jetpack_sharing_enabled":true,"jetpack_featured_media_url":"","_links":{"self":[{"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/posts\/4770","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=4770"}],"version-history":[{"count":1,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/posts\/4770\/revisions"}],"predecessor-version":[{"id":4775,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/posts\/4770\/revisions\/4775"}],"wp:attachment":[{"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=4770"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=4770"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=4770"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}