{"id":2624,"date":"2024-03-29T18:51:16","date_gmt":"2024-03-29T10:51:16","guid":{"rendered":"https:\/\/www.aqwu.net\/wp\/?p=2624"},"modified":"2024-04-28T20:01:12","modified_gmt":"2024-04-28T12:01:12","slug":"%e4%ba%86%e8%a7%a3-jamba-%e6%a8%a1%e5%9e%8b","status":"publish","type":"post","link":"https:\/\/www.aqwu.net\/wp\/?p=2624","title":{"rendered":"\u4e86\u89e3 Jamba \u6a21\u578b"},"content":{"rendered":"\n<p class=\"wp-block-paragraph\">\u5bf9\u4e8e Transformers \u7cfb\u5217\u6a21\u578b\uff0c\u7531\u4e8e\u8fd9\u4e9b\u6a21\u578b\u5728\u81ea\u7136\u8bed\u8a00\u5904\u7406\uff08NLP\uff09\u9886\u57df\u5185\u975e\u5e38\u6d41\u884c\uff0c\u5f88\u591a\u6df1\u5ea6\u5b66\u4e60\u6846\u67b6\u90fd\u63d0\u4f9b\u4e86\u5bf9\u5b83\u4eec\u7684\u652f\u6301\u548c\u5b9e\u73b0\uff0c\u6bd4\u5982 Hugging Face \u63d0\u4f9b\u7684 <code>transformers<\/code> \u5e93\uff08\u652f\u6301 PyTorch \u548c TensorFlow\uff09\u3002\u8fd9\u4e2a\u5e93\u4e2d\u7684\u6a21\u578b\u901a\u5e38\u5305\u62ec BERT\u3001GPT\u3001RoBERTa\u3001Transformer-XL\u3001T5 \u7b49\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u8fd9\u91cc\u4f7f\u7528 ai21labs\/Jamba-v0.1 \u7528\u4f5c\u6d4b\u8bd5\u6a21\u578b<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:sh decode:true \">https:\/\/huggingface.co\/ai21labs\/Jamba-v0.1\/\n<\/pre><\/div>\n\n\n\n<h2 class=\"wp-block-heading\"><strong>1. 
\u4e0b\u8f7d\u6a21\u578b\uff0c\u67e5\u770b\u6a21\u578b\u7684\u5927\u5c0f<\/strong><\/h2>\n\n\n\n<p class=\"wp-block-paragraph\">\u6a21\u578b\u5927\u6982\u670996GB\u5927\u5c0f\uff0c\u5355\u4e2aGPU\u53ef\u80fd\u65e0\u6cd5\u52a0\u8f7d\uff0c\u9700\u8981\u4f7f\u7528\u591a\u4e2aGPU\u8fdb\u884c\u52a0\u8f7d<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:sh decode:true \">ls -l ai21labs\/Jamba-v0.1\/\ntotal 100728956\n-rwxrwxrwx 1 tony tony       1310 Mar 29 13:11 config.json\n-rwxrwxrwx 1 tony tony      11238 Mar 29 13:11 configuration_jamba.py\n-rwxrwxrwx 1 tony tony        137 Mar 29 13:11 generation_config.json\n-rwxrwxrwx 1 tony tony       1519 Mar 29 13:11 gitattributes\n-rwxrwxrwx 1 tony tony 4951236864 Mar 29 13:46 model-00001-of-00021.safetensors\n-rwxrwxrwx 1 tony tony 4884145024 Mar 29 13:48 model-00002-of-00021.safetensors\n-rwxrwxrwx 1 tony tony 4992294632 Mar 29 13:49 model-00003-of-00021.safetensors\n-rwxrwxrwx 1 tony tony 4958591040 Mar 29 13:50 model-00004-of-00021.safetensors\n-rwxrwxrwx 1 tony tony 4975501296 Mar 29 13:50 model-00005-of-00021.safetensors\n-rwxrwxrwx 1 tony tony 4884145016 Mar 29 14:01 model-00006-of-00021.safetensors\n-rwxrwxrwx 1 tony tony 4884144968 Mar 29 14:39 model-00007-of-00021.safetensors\n-rwxrwxrwx 1 tony tony 4992294696 Mar 29 14:43 model-00008-of-00021.safetensors\n-rwxrwxrwx 1 tony tony 4932506800 Mar 29 14:41 model-00009-of-00021.safetensors\n-rwxrwxrwx 1 tony tony 4884145056 Mar 29 14:48 model-00010-of-00021.safetensors\n-rwxrwxrwx 1 tony tony 4884145088 Mar 29 15:08 model-00011-of-00021.safetensors\n-rwxrwxrwx 1 tony tony 4884145088 Mar 29 15:09 model-00012-of-00021.safetensors\n-rwxrwxrwx 1 tony tony 4932506800 Mar 29 15:10 model-00013-of-00021.safetensors\n-rwxrwxrwx 1 tony tony 4992294648 Mar 29 15:11 model-00014-of-00021.safetensors\n-rwxrwxrwx 1 tony tony 4884145088 Mar 29 15:11 model-00015-of-00021.safetensors\n-rwxrwxrwx 1 tony tony 4884145088 Mar 29 15:32 
model-00016-of-00021.safetensors\n-rwxrwxrwx 1 tony tony 4908260352 Mar 29 15:34 model-00017-of-00021.safetensors\n-rwxrwxrwx 1 tony tony 4908391496 Mar 29 15:34 model-00018-of-00021.safetensors\n-rwxrwxrwx 1 tony tony 4992294688 Mar 29 15:43 model-00019-of-00021.safetensors\n-rwxrwxrwx 1 tony tony 4884145088 Mar 29 15:43 model-00020-of-00021.safetensors\n-rwxrwxrwx 1 tony tony 4647318256 Mar 29 15:43 model-00021-of-00021.safetensors\n-rwxrwxrwx 1 tony tony     107400 Mar 29 13:11 model.safetensors.index.json\n-rwxrwxrwx 1 tony tony      99785 Mar 29 13:11 modeling_jamba.py\n-rwxrwxrwx 1 tony tony        121 Mar 29 13:11 special_tokens_map.json\n-rwxrwxrwx 1 tony tony    4242082 Mar 29 13:12 tokenizer.json\n-rwxrwxrwx 1 tony tony    1124742 Mar 29 13:12 tokenizer.model\n-rwxrwxrwx 1 tony tony       1109 Mar 29 13:12 tokenizer_config.json<\/pre><\/div>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<h2 class=\"wp-block-heading\"><strong>2. \u4f7f\u7528 AutoConfig<\/strong> \u83b7\u53d6\u6a21\u578b\u53c2\u6570<\/h2>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:python decode:true \">from transformers import AutoConfig\n\n# \u52a0\u8f7d\u7279\u5b9a\u6a21\u578b\u7684\u914d\u7f6e\nconfig = AutoConfig.from_pretrained('ai21labs\/Jamba-v0.1', trust_remote_code=True)\n\n# \u663e\u793a\u914d\u7f6e\u4fe1\u606f\nprint(config)\n<\/pre><\/div>\n\n\n\n<p class=\"wp-block-paragraph\">\u8fd9\u4e2a\u65f6\u5019\u53ef\u4ee5\u653e\u5fc3\u8fd0\u884c\uff0c\u4e0d\u52a0\u8f7d\u6a21\u578b\uff0c\u53ea\u52a0\u8f7d\u6a21\u578b\u7684\u914d\u7f6e\u6587\u4ef6\uff0c\u8fd0\u884c\u8f93\u51fa\u7ed3\u679c<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:sh decode:true \">python test01.py\nJambaConfig {\n  \"_name_or_path\": \"ai21labs\/Jamba-v0.1\",\n  \"architectures\": [\n    \"JambaForCausalLM\"\n  ],\n  \"attention_dropout\": 0.0,\n  \"attn_layer_offset\": 4,\n  \"attn_layer_period\": 8,\n  \"auto_map\": {\n   
 \"AutoConfig\": \"configuration_jamba.JambaConfig\",\n    \"AutoModel\": \"modeling_jamba.JambaModel\",\n    \"AutoModelForCausalLM\": \"modeling_jamba.JambaForCausalLM\",\n    \"AutoModelForSequenceClassification\": \"model.JambaForSequenceClassification\"\n  },\n  \"bos_token_id\": 1,\n  \"calc_logits_for_entire_prompt\": false,\n  \"eos_token_id\": 2,\n  \"expert_layer_offset\": 1,\n  \"expert_layer_period\": 2,\n  \"hidden_act\": \"silu\",\n  \"hidden_size\": 4096,\n  \"initializer_range\": 0.02,\n  \"intermediate_size\": 14336,\n  \"mamba_conv_bias\": true,\n  \"mamba_d_conv\": 4,\n  \"mamba_d_state\": 16,\n  \"mamba_dt_rank\": 256,\n  \"mamba_expand\": 2,\n  \"mamba_inner_layernorms\": true,\n  \"mamba_proj_bias\": false,\n  \"model_type\": \"jamba\",\n  \"n_ctx\": 262144,\n  \"num_attention_heads\": 32,\n  \"num_experts\": 16,\n  \"num_experts_per_tok\": 2,\n  \"num_hidden_layers\": 32,\n  \"num_key_value_heads\": 8,\n  \"output_router_logits\": false,\n  \"pad_token_id\": 0,\n  \"rms_norm_eps\": 1e-06,\n  \"router_aux_loss_coef\": 0.001,\n  \"sliding_window\": null,\n  \"tie_word_embeddings\": false,\n  \"torch_dtype\": \"bfloat16\",\n  \"transformers_version\": \"4.39.2\",\n  \"use_cache\": true,\n  \"use_mamba_kernels\": true,\n  \"vocab_size\": 65536\n}<\/pre><\/div>\n\n\n\n<p class=\"wp-block-paragraph\">\u4ece\u7ed3\u679c\u6765\u770b\uff0c\u5176\u5b9e\u662f\u6a21\u578b\u76ee\u5f55\u4e0b\u7684 config.json \u6587\u4ef6\u91cc\u9762\u7684\u5185\u5bb9<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u4ece\u63d0\u4f9b\u7684\u8f93\u51fa\u6765\u770b\uff0c<code>test01.py<\/code> \u811a\u672c\u5f88\u53ef\u80fd\u662f\u8c03\u7528\u4e86 Hugging Face <code>transformers<\/code> \u5e93\u4e2d\u7684 <code>AutoConfig.from_pretrained<\/code> \u65b9\u6cd5\uff0c\u7528\u4e8e\u52a0\u8f7d\u4e00\u4e2a\u540d\u4e3a <code>ai21labs\/Jamba-v0.1<\/code> \u7684\u6a21\u578b\u914d\u7f6e\u3002\u8fd9\u4e2a\u914d\u7f6e\u5c5e\u4e8e <code>JambaConfig<\/code> 
\u7c7b\uff0c\u8fd9\u662f\u4e00\u4e2a\u5b9a\u5236\u5316\u7684\u6a21\u578b\u914d\u7f6e\uff0c\u53ef\u80fd\u662f\u4e3a <code>Jamba<\/code> \u6a21\u578b\u7279\u522b\u8bbe\u8ba1\u7684\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u8f93\u51fa\u4e2d\u5305\u542b\u4e86\u5927\u91cf\u7684\u914d\u7f6e\u4fe1\u606f\uff0c\u5176\u4e2d\u4e00\u4e9b\u5173\u952e\u5b57\u6bb5\u8bf4\u660e\u5982\u4e0b\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong><code>\"architectures\"<\/code><\/strong>: \u663e\u793a\u4e86\u6a21\u578b\u7684\u67b6\u6784\uff0c\u8fd9\u91cc\u662f <code>JambaForCausalLM<\/code>\uff0c\u8868\u660e\u8fd9\u4e2a\u914d\u7f6e\u662f\u4e3a\u56e0\u679c\u8bed\u8a00\u6a21\u578b\u8bbe\u8ba1\u7684\u3002<\/li>\n\n\n\n<li><strong><code>\"hidden_size\"<\/code><\/strong>: \u9690\u85cf\u5c42\u7684\u5927\u5c0f\u662f 4096\u3002<\/li>\n\n\n\n<li><strong><code>\"num_hidden_layers\"<\/code><\/strong>: \u8868\u660e\u6a21\u578b\u6709 32 \u5c42\u9690\u85cf\u5c42\u3002<\/li>\n\n\n\n<li><strong><code>\"num_attention_heads\"<\/code><\/strong>: \u6a21\u578b\u4f7f\u7528\u4e86 32 \u4e2a\u6ce8\u610f\u529b\u5934\u3002<\/li>\n\n\n\n<li><strong><code>\"vocab_size\"<\/code><\/strong>: \u8bcd\u6c47\u8868\u7684\u5927\u5c0f\u662f 65536\u3002<\/li>\n\n\n\n<li><strong><code>\"transformers_version\"<\/code><\/strong>: \u6307\u660e\u4e86 <code>transformers<\/code> \u5e93\u7684\u7248\u672c\u662f <code>4.39.2<\/code>\u3002<\/li>\n<\/ul>\n\n\n\n<p class=\"wp-block-paragraph\">\u6b64\u5916\uff0c\u8fd8\u5305\u542b\u4e86\u4e00\u4e9b <code>Jamba<\/code> \u6a21\u578b\u7279\u6709\u7684\u914d\u7f6e\u9879\uff0c\u5982 <code>mamba_conv_bias<\/code>\u3001<code>mamba_d_conv<\/code>\u3001<code>mamba_d_state<\/code> \u7b49\uff0c\u8fd9\u4e9b\u914d\u7f6e\u9879\u7528\u4e8e\u8bbe\u7f6e\u6a21\u578b\u4e2d Mamba \u5c42\u7684\u5377\u79ef\u548c\u72b6\u6001\u53c2\u6570\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><code><strong>num_hidden_layers<\/strong><\/code> 
\u6307\u7684\u662f\u8be5\u6a21\u578b\u67b6\u6784\u4e2d\u7684\u9690\u85cf\u5c42\u603b\u6570\u3002\u8fd9\u4e2a\u53c2\u6570\u5c31\u4ee3\u8868\u4e86\u6a21\u578b\u4e2d\u76f8\u5e94\u90e8\u5206\u7684\u5c42\u6570\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u4e86\u89e3\u8fd9\u4e2a\u53c2\u6570 <code><strong>num_hidden_layers<\/strong><\/code> \u5bf9\u540e\u9762\u7684\u6a21\u578b\u52a0\u8f7d\u6709\u7528\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\"><strong>3. \u4f7f\u7528 CPU \u52a0\u8f7d\u6a21\u578b<\/strong><\/h2>\n\n\n\n<p class=\"wp-block-paragraph\">\u5f53\u4f60\u7684 GPU \u5185\u5b58\u4e0d\u591f\u4f7f\u7528\u7684\u65f6\u5019\uff0c\u53ef\u4ee5\u4f7f\u7528 CPU \u52a0\u8f7d\u6a21\u578b\u6765\u83b7\u53d6\u6a21\u578b\u53c2\u6570\uff1b\u5982\u679c\u5185\u5b58\u4e0d\u591f\uff08\u6b64\u6a21\u578b\u5927\u6982\u9700\u8981 212GB\uff09\uff0c\u53ef\u80fd\u9700\u8981\u589e\u5927\u4ea4\u6362\u533a\uff08Linux\uff09\u3002<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:python decode:true \">from transformers import AutoModelForCausalLM\nimport torch\n\nMODEL_NAME = \"ai21labs\/Jamba-v0.1\"\ndevice = torch.device(\"cpu\")\nmodel = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True).to(device)\nprint(model)\n<\/pre><\/div>\n\n\n\n<p class=\"wp-block-paragraph\">\u8fd0\u884c\u7ed3\u679c\u5982\u4e0b\uff1a<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:python decode:true \">python test02.py\nLoading checkpoint shards: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 21\/21 [27:07&lt;00:00, 77.52s\/it]\nJambaForCausalLM(\n  (model): JambaModel(\n    (embed_tokens): Embedding(65536, 4096, padding_idx=0)\n    (layers): ModuleList(\n      (0): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, 
out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (experts): ModuleList(\n            (0): JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (1): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (router): Linear(in_features=4096, out_features=16, bias=False)\n          (experts): ModuleList(\n            (0-15): 16 x JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n       
       (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (2): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (experts): ModuleList(\n            (0): JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (3): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n    
      (router): Linear(in_features=4096, out_features=16, bias=False)\n          (experts): ModuleList(\n            (0-15): 16 x JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (4): JambaAttentionDecoderLayer(\n        (self_attn): JambaSdpaAttention(\n          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n        )\n        (moe): JambaSparseMoeBlock(\n          (experts): ModuleList(\n            (0): JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (5): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          
(dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (router): Linear(in_features=4096, out_features=16, bias=False)\n          (experts): ModuleList(\n            (0-15): 16 x JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (6): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (experts): ModuleList(\n            (0): JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (7): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, 
kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (router): Linear(in_features=4096, out_features=16, bias=False)\n          (experts): ModuleList(\n            (0-15): 16 x JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (8): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (experts): ModuleList(\n            (0): JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): 
Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (9): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (router): Linear(in_features=4096, out_features=16, bias=False)\n          (experts): ModuleList(\n            (0-15): 16 x JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (10): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): 
Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (experts): ModuleList(\n            (0): JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (11): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (router): Linear(in_features=4096, out_features=16, bias=False)\n          (experts): ModuleList(\n            (0-15): 16 x JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (12): JambaAttentionDecoderLayer(\n    
    (self_attn): JambaSdpaAttention(\n          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n        )\n        (moe): JambaSparseMoeBlock(\n          (experts): ModuleList(\n            (0): JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (13): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (router): Linear(in_features=4096, out_features=16, bias=False)\n          (experts): ModuleList(\n            (0-15): 16 x JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n 
         )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (14): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (experts): ModuleList(\n            (0): JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (15): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (router): 
Linear(in_features=4096, out_features=16, bias=False)\n          (experts): ModuleList(\n            (0-15): 16 x JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (16): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (experts): ModuleList(\n            (0): JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (17): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): 
Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (router): Linear(in_features=4096, out_features=16, bias=False)\n          (experts): ModuleList(\n            (0-15): 16 x JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (18): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (experts): ModuleList(\n            (0): JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          
)\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (19): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (router): Linear(in_features=4096, out_features=16, bias=False)\n          (experts): ModuleList(\n            (0-15): 16 x JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (20): JambaAttentionDecoderLayer(\n        (self_attn): JambaSdpaAttention(\n          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n        )\n        (moe): JambaSparseMoeBlock(\n          (experts): ModuleList(\n            (0): JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, 
out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (21): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (router): Linear(in_features=4096, out_features=16, bias=False)\n          (experts): ModuleList(\n            (0-15): 16 x JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (22): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, 
out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (experts): ModuleList(\n            (0): JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (23): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (router): Linear(in_features=4096, out_features=16, bias=False)\n          (experts): ModuleList(\n            (0-15): 16 x JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (24): JambaMambaDecoderLayer(\n        (mamba): 
JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (experts): ModuleList(\n            (0): JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (25): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (router): Linear(in_features=4096, out_features=16, bias=False)\n          (experts): ModuleList(\n            (0-15): 16 x JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, 
bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (26): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (experts): ModuleList(\n            (0): JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (27): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, 
out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (router): Linear(in_features=4096, out_features=16, bias=False)\n          (experts): ModuleList(\n            (0-15): 16 x JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (28): JambaAttentionDecoderLayer(\n        (self_attn): JambaSdpaAttention(\n          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n        )\n        (moe): JambaSparseMoeBlock(\n          (experts): ModuleList(\n            (0): JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (29): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): 
Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (router): Linear(in_features=4096, out_features=16, bias=False)\n          (experts): ModuleList(\n            (0-15): 16 x JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (30): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (experts): ModuleList(\n            (0): JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          
)\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n      (31): JambaMambaDecoderLayer(\n        (mamba): JambaMambaMixer(\n          (conv1d): Conv1d(8192, 8192, kernel_size=(4,), stride=(1,), padding=(3,), groups=8192)\n          (act): SiLU()\n          (in_proj): Linear(in_features=4096, out_features=16384, bias=False)\n          (x_proj): Linear(in_features=8192, out_features=288, bias=False)\n          (dt_proj): Linear(in_features=256, out_features=8192, bias=True)\n          (out_proj): Linear(in_features=8192, out_features=4096, bias=False)\n          (dt_layernorm): JambaRMSNorm()\n          (B_layernorm): JambaRMSNorm()\n          (C_layernorm): JambaRMSNorm()\n        )\n        (moe): JambaSparseMoeBlock(\n          (router): Linear(in_features=4096, out_features=16, bias=False)\n          (experts): ModuleList(\n            (0-15): 16 x JambaMLP(\n              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): JambaRMSNorm()\n        (pre_moe_layernorm): JambaRMSNorm()\n      )\n    )\n    (final_layernorm): JambaRMSNorm()\n  )\n  (lm_head): Linear(in_features=4096, out_features=65536, bias=False)\n)<\/pre><\/div>\n\n\n\n<h2 class=\"wp-block-heading\"><strong>4. 
\u4f7f\u7528\u591a\u4e2a GPU \u52a0\u8f7d\u6a21\u578b<\/strong><\/h2>\n\n\n\n<p class=\"wp-block-paragraph\">\u5f53\u4f60\u7684\u5355\u4e2a GPU \u5185\u5b58\u4e0d\u591f\u4f7f\u7528\u7684\u65f6\u5019\uff0c\u53ef\u80fd\u9700\u8981\u591a\u4e2aGPU\uff0c\u8fd9\u91cc\u6211\u4eec\u4f7f\u7528\u4e86 8 \u4e2a RTX 4090 \u7684\u5361\uff0c\u5b9e\u9645\u4e0a\u5bf9\u4e8e\u8fd9\u4e2a\u6a21\u578b\uff0c\u4e0d\u9700\u8981\u8fd9\u4e48\u591a\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u4eceCPU\u7684\u53c2\u6570\u6765\u770b\uff0c\u53ef\u4ee5\u5f97\u77e5\uff0c\u6709\u5982\u4e0b\u6743\u91cd\u53c2\u6570\uff1a<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li>model.embed_tokens<\/li>\n\n\n\n<li>model.norm<\/li>\n\n\n\n<li>model.final_layernorm<\/li>\n\n\n\n<li>lm_head<\/li>\n\n\n\n<li>model.layers (\u603b\u517132\u5c42)<\/li>\n<\/ol>\n\n\n\n<p class=\"wp-block-paragraph\">\u6211\u4eec\u53ef\u4ee5\u5c06Transformer\u6a21\u578b\u7684\u6bcf\u4e00\u5c42\u5206\u914d\u7ed9\u4e00\u4e2aGPU\uff0c\u6bcf\u4e2aGPU\u53ef\u80fd\u627f\u62c5\u591a\u5c42\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u4ee3\u7801\u5982\u4e0b\uff1a<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:python decode:true \">from transformers import AutoTokenizer, AutoModelForCausalLM\nimport torch\n\n# \u5b9a\u4e49\u4e00\u4e2a\u51fd\u6570\u6765\u81ea\u52a8\u914d\u7f6e\u5728\u591aGPU\u73af\u5883\u4e0b\u6a21\u578b\u5404\u90e8\u5206\u7684\u8bbe\u5907\u5206\u5e03\ndef auto_configure_device_map(num_gpus: int):\n    num_trans_layers = 32  # \u5b9a\u4e49Transformer\u6a21\u578b\u7684\u5c42\u6570\n    per_gpu_layers = num_trans_layers \/ num_gpus  # \u8ba1\u7b97\u6bcf\u4e2aGPU\u5e94\u627f\u62c5\u7684\u5c42\u6570\n    # \u521d\u59cb\u5316\u8bbe\u5907\u6620\u5c04\u5b57\u5178\uff0c\u6307\u5b9a\u4e00\u4e9b\u7279\u5b9a\u6a21\u5757\u5e94\u8be5\u653e\u7f6e\u7684GPU\u7f16\u53f7\n    device_map = {\n        'model.embed_tokens': 0,  # \u5d4c\u5165\u5c42\u653e\u5728\u7b2c\u4e00\u4e2aGPU\u4e0a\n        'model.norm': 
num_gpus-1,  # \u6700\u540e\u4e00\u4e2a\u5f52\u4e00\u5316\u5c42\u653e\u5728\u6700\u540e\u4e00\u4e2aGPU\u4e0a\n        'lm_head': num_gpus-1,  # \u8bed\u8a00\u6a21\u578b\u5934\uff08\u7528\u4e8e\u9884\u6d4b\u4e0b\u4e00\u4e2a\u8bcd\u7684\u5c42\uff09\u653e\u5728\u6700\u540e\u4e00\u4e2aGPU\u4e0a\n        'model.final_layernorm': num_gpus-1  # \u6700\u540e\u4e00\u4e2aLayerNorm\u5c42\u4e5f\u653e\u5728\u6700\u540e\u4e00\u4e2aGPU\u4e0a\n    }\n    # \u5c06Transformer\u6a21\u578b\u7684\u6bcf\u4e00\u5c42\u5206\u914d\u7ed9\u4e00\u4e2aGPU\n    for i in range(num_trans_layers):\n        device_map[f'model.layers.{i}'] = int(i\/\/per_gpu_layers)\n    return device_map\n\nMODEL_NAME = \"ai21labs\/Jamba-v0.1\"\n\n# \u68c0\u6d4b\u53ef\u7528\u7684GPU\u6570\u91cf\nNUM_GPUS = torch.cuda.device_count()\n\nMAX_TOKENS = 512  # \u5b9a\u4e49\u6700\u5927\u4ee4\u724c\u6570\n\n# \u5982\u679c\u6709\u53ef\u7528\u7684GPU\uff0c\u5219\u57fa\u4e8eGPU\u6570\u91cf\u81ea\u52a8\u914d\u7f6e\u8bbe\u5907\u6620\u5c04\uff1b\u5426\u5219\u4e0d\u4f7f\u7528\u8bbe\u5907\u6620\u5c04\ndevice_map = auto_configure_device_map(NUM_GPUS) if NUM_GPUS &gt; 0 else None\n\n# \u5982\u679c\u6709\u53ef\u7528\u7684GPU\uff0c\u5219\u4f7f\u7528\u7b2c\u4e00\u4e2aGPU\uff1b\u5426\u5219\u4f7f\u7528CPU\ndevice = torch.device(\"cuda\") if NUM_GPUS &gt; 0 else torch.device(\"cpu\")\n\n# \u6839\u636e\u662f\u5426\u4f7f\u7528GPU\u8bbe\u7f6e\u6570\u636e\u7c7b\u578b\uff08\u534a\u7cbe\u5ea6\u6216\u5168\u7cbe\u5ea6\uff09\ndevice_dtype = torch.half if NUM_GPUS &gt; 0 else torch.float\n\n# \u52a0\u8f7d\u5206\u8bcd\u5668\u548c\u6a21\u578b\uff0c\u6307\u5b9a\u8bbe\u5907\u6620\u5c04\u548c\u6570\u636e\u7c7b\u578b\ntokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)\nmodel = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True, device_map=device_map, torch_dtype=device_dtype)\n\n# \u5c06\u6a21\u578b\u8bbe\u7f6e\u4e3a\u8bc4\u4f30\u6a21\u5f0f\nmodel = model.eval()\n\n# 
\u5bf9\u8f93\u5165\u6587\u672c\u8fdb\u884c\u5206\u8bcd\uff0c\u5e76\u5c06\u5206\u8bcd\u7ed3\u679c\u79fb\u5230\u6a21\u578b\u6240\u5728\u8bbe\u5907\ninput_ids = tokenizer(\"In the recent Super Bowl LVIII,\", return_tensors='pt').to(device)[\"input_ids\"]\n\n# \u4f7f\u7528\u6a21\u578b\u751f\u6210\u6587\u672c\noutputs = model.generate(input_ids, max_new_tokens=MAX_TOKENS)\n\n# \u6253\u5370\u751f\u6210\u7684\u6587\u672c\nprint(tokenizer.batch_decode(outputs))\n<\/pre><\/div>\n\n\n\n<p class=\"wp-block-paragraph\">\u8f93\u51fa\u7684\u7ed3\u679c\uff1a<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:batch decode:true \">python test03.py\nLoading checkpoint shards: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 21\/21 [27:41&lt;00:00, 79.11s\/it]\nThe `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.\n[\"&lt;|startoftext|&gt;In the recent Super Bowl LVIII, the Kansas City Chiefs emerged victorious, defeating the San Francisco 49ers in a thrilling overtime showdown. The game was a nail-biter, with both teams showcasing their skills and determination.\nThe Chiefs, led by their star quarterback Patrick Mahomes, displayed their offensive prowess, while the 49ers, led by their defense, put up a strong fight. The game went into overtime, with the Chiefs ultimately securing the win with a touchdown.\nThe victory marked the Chiefs' second Super Bowl win in four years, solidifying their status as one of the top teams in the NFL. The game was a testament to the skill and determination of both teams, and it will be remembered as one of the most exciting Super Bowls in recent history.\nThe Kansas City Chiefs' victory in Super Bowl LVIII was a thrilling and historic moment for the team and its fans. 
The game was a testament to the skill and determination of both teams, and it will be remembered as one of the most exciting Super Bowls in recent history.\nThe Chiefs' victory was a result of their offensive prowess, led by their star quarterback Patrick Mahomes, and their ability to come back from a 10-point deficit in the fourth quarter. The 49ers, led by their defense, put up a strong fight, but ultimately fell short in overtime.\nThe game was a showcase of the best of the NFL, with both teams displaying their skills and determination. The Chiefs' victory was a culmination of their hard work and dedication throughout the season, and it will be remembered as a historic moment in the team's history.\nThe game was a testament to the skill and determination of both teams, and it will be remembered as one of the most exciting Super Bowls in recent history. The Chiefs' victory was a result of their offensive prowess, led by their star quarterback Patrick Mahomes, and their ability to come back from a 10-point deficit in the fourth quarter.\nThe 49ers, led by their defense, put up a strong fight, but ultimately fell short in overtime. The game was a showcase of the best of the NFL, with both teams displaying their skills and determination.\nThe Chiefs' victory was a culmination of their hard work and dedication throughout the season, and it will be remembered as a historic moment in the team's history.\nThe game was a testament to the skill and determination of both teams, and it will be remembered\"]<\/pre><\/div>\n\n\n\n<h2 class=\"wp-block-heading\"><strong>5. 
\u591a GPU int8 \u548c int4 \u91cf\u5316\u90e8\u7f72<\/strong><\/h2>\n\n\n\n<p class=\"wp-block-paragraph\">\u91cf\u5316\u540e\u9700\u8981\u7684 GPU \u5185\u5b58\u66f4\u5c11\uff0c\u4f46\u52a0\u8f7d\u7684\u65f6\u95f4\u4f1a\u66f4\u957f\u3002\u4e0b\u9762\u7684\u4ee3\u7801\u662f int4 \u7684\uff1a<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:python decode:true \">from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\nimport torch\n\n# \u5b9a\u4e49\u4e00\u4e2a\u51fd\u6570\u6765\u81ea\u52a8\u914d\u7f6e\u5728\u591aGPU\u73af\u5883\u4e0b\u6a21\u578b\u5404\u90e8\u5206\u7684\u8bbe\u5907\u5206\u5e03\ndef auto_configure_device_map(num_gpus: int):\n    num_trans_layers = 32  # \u5b9a\u4e49Transformer\u6a21\u578b\u7684\u5c42\u6570\n    per_gpu_layers = num_trans_layers \/ num_gpus  # \u8ba1\u7b97\u6bcf\u4e2aGPU\u5e94\u627f\u62c5\u7684\u5c42\u6570\n    # \u521d\u59cb\u5316\u8bbe\u5907\u6620\u5c04\u5b57\u5178\uff0c\u6307\u5b9a\u4e00\u4e9b\u7279\u5b9a\u6a21\u5757\u5e94\u8be5\u653e\u7f6e\u7684GPU\u7f16\u53f7\n    device_map = {\n        'model.embed_tokens': 0,  # \u5d4c\u5165\u5c42\u653e\u5728\u7b2c\u4e00\u4e2aGPU\u4e0a\n        'model.norm': num_gpus-1,  # \u6700\u540e\u4e00\u4e2a\u5f52\u4e00\u5316\u5c42\u653e\u5728\u6700\u540e\u4e00\u4e2aGPU\u4e0a\n        'lm_head': num_gpus-1,  # \u8bed\u8a00\u6a21\u578b\u5934\uff08\u7528\u4e8e\u9884\u6d4b\u4e0b\u4e00\u4e2a\u8bcd\u7684\u5c42\uff09\u653e\u5728\u6700\u540e\u4e00\u4e2aGPU\u4e0a\n        'model.final_layernorm': num_gpus-1  # \u6700\u540e\u4e00\u4e2aLayerNorm\u5c42\u4e5f\u653e\u5728\u6700\u540e\u4e00\u4e2aGPU\u4e0a\n    }\n    # \u5c06Transformer\u6a21\u578b\u7684\u6bcf\u4e00\u5c42\u5206\u914d\u7ed9\u4e00\u4e2aGPU\n    for i in range(num_trans_layers):\n        device_map[f'model.layers.{i}'] = int(i\/\/per_gpu_layers)\n    return device_map\n\nMODEL_NAME = \"ai21labs\/Jamba-v0.1\"\n\n# \u68c0\u6d4b\u53ef\u7528\u7684GPU\u6570\u91cf\nNUM_GPUS = 
torch.cuda.device_count()\n\nMAX_TOKENS = 512  # \u5b9a\u4e49\u6700\u5927\u4ee4\u724c\u6570\n\n# \u5982\u679c\u6709\u53ef\u7528\u7684GPU\uff0c\u5219\u57fa\u4e8eGPU\u6570\u91cf\u81ea\u52a8\u914d\u7f6e\u8bbe\u5907\u6620\u5c04\uff1b\u5426\u5219\u4e0d\u4f7f\u7528\u8bbe\u5907\u6620\u5c04\ndevice_map = auto_configure_device_map(NUM_GPUS) if NUM_GPUS &gt; 0 else None\n\n# \u5982\u679c\u6709\u53ef\u7528\u7684GPU\uff0c\u5219\u4f7f\u7528\u7b2c\u4e00\u4e2aGPU\uff1b\u5426\u5219\u4f7f\u7528CPU\ndevice = torch.device(\"cuda\") if NUM_GPUS &gt; 0 else torch.device(\"cpu\")\n\n# \u6839\u636e\u662f\u5426\u4f7f\u7528GPU\u8bbe\u7f6e\u6570\u636e\u7c7b\u578b\uff08\u534a\u7cbe\u5ea6\u6216\u5168\u7cbe\u5ea6\uff09\ndevice_dtype = torch.float16 if NUM_GPUS &gt; 0 else torch.float\n\n# \u52a0\u8f7d\u5206\u8bcd\u5668\u548c\u6a21\u578b\uff0c\u6307\u5b9a\u8bbe\u5907\u6620\u5c04\u548c\u6570\u636e\u7c7b\u578b\ntokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)\n\nquantization_config = BitsAndBytesConfig(load_in_4bit=True,\n                                         llm_int4_skip_modules=[\"mamba\"])\nmodel = AutoModelForCausalLM.from_pretrained(MODEL_NAME,\n                                             trust_remote_code=True,\n                                             torch_dtype=torch.bfloat16,\n                                             attn_implementation=\"flash_attention_2\",\n                                             quantization_config=quantization_config,\n                                             device_map=device_map)\n                                             \n#model = dispatch_model(model, device_map=device_map)\n\n# \u5c06\u6a21\u578b\u8bbe\u7f6e\u4e3a\u8bc4\u4f30\u6a21\u5f0f\nmodel = model.eval()\n\n# \u5bf9\u8f93\u5165\u6587\u672c\u8fdb\u884c\u5206\u8bcd\uff0c\u5e76\u5c06\u5206\u8bcd\u7ed3\u679c\u79fb\u5230\u6a21\u578b\u6240\u5728\u8bbe\u5907\ninput_ids = tokenizer(\"In the recent Super Bowl LVIII,\", 
return_tensors='pt').to(device)[\"input_ids\"]\n\n# \u4f7f\u7528\u6a21\u578b\u751f\u6210\u6587\u672c\noutputs = model.generate(input_ids, max_new_tokens=MAX_TOKENS)\n\n# \u6253\u5370\u751f\u6210\u7684\u6587\u672c\nprint(tokenizer.batch_decode(outputs))\n<\/pre><\/div>\n\n\n\n<p class=\"wp-block-paragraph\">\u5982\u679c\u9700\u8981int8 \u7684\uff0c\u53ea\u9700\u8981\u4fee\u6539\uff1a<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:python decode:true \">quantization_config = BitsAndBytesConfig(load_in_8bit=True,\n                                         llm_int8_skip_modules=[\"mamba\"])\n<\/pre><\/div>\n\n\n\n<p class=\"wp-block-paragraph\">\u8fd0\u884c\u7ed3\u679c\uff1a<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:sh decode:true \" > python test06.py\nLoading checkpoint shards: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 21\/21 [1:25:22&lt;00:00, 243.91s\/it]\nThe `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.\n[\"&lt;|startoftext|&gt;In the recent Super Bowl LVIII, the Kansas City Chiefs emerged victorious, defeating the San Francisco 49ers 25-22 in overtime. The game, which took place at Allegiant Stadium in Las Vegas, Nevada, was a thrilling spectacle for football fans around the world.\nThe Chiefs, led by their star quarterback Patrick Mahomes, showcased their resilience and determination throughout the game. Mahomes, who was named the Super Bowl MVP for the second time in his career, delivered an outstanding performance, throwing for 333 yards and two touchdowns.\nThe 49ers, on the other hand, put up a valiant effort but fell short in the end. Quarterback Brock Purdy, who had a breakout season, struggled to find his rhythm against the Chiefs' formidable defense. 
Despite a strong start, the 49ers were unable to maintain their momentum and ultimately fell short of their goal.\nThe game was not without its controversies, however. One of the most talked-about moments was a controversial call by the referees in the fourth quarter, which many believed cost the 49ers a crucial touchdown. The call sparked outrage among fans and pundits alike, with many questioning the integrity of the officiating.\nDespite the controversies, the Super Bowl LVIII was a memorable event for football fans. The halftime show, featuring performances by Usher and Alicia Keys, was a highlight of the evening, with fans praising the energy and excitement of the show.\nOverall, Super Bowl LVIII was a thrilling and unforgettable event, showcasing the best of American football and entertainment. The Kansas City Chiefs' victory will go down in history as one of the greatest moments in football history, and the controversies surrounding the game will continue to be debated for years to come.&lt;|endoftext|&gt;\"]<\/pre><\/div>\n","protected":false},"excerpt":{"rendered":"<p>\u5bf9\u4e8e Transformers \u7cfb\u5217\u6a21\u578b\uff0c\u7531\u4e8e\u8fd9\u4e9b\u6a21\u578b\u5728\u81ea\u7136\u8bed\u8a00\u5904\u7406\uff08NLP\uff09\u9886\u57df\u5185\u975e\u5e38\u6d41\u884c\uff0c\u5f88\u591a\u6df1\u5ea6\u5b66\u4e60\u6846\u67b6 
[&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"site-sidebar-layout":"default","site-content-layout":"","ast-site-content-layout":"default","site-content-style":"default","site-sidebar-style":"default","ast-global-header-display":"","ast-banner-title-visibility":"","ast-main-header-display":"","ast-hfb-above-header-display":"","ast-hfb-below-header-display":"","ast-hfb-mobile-header-display":"","site-post-title":"","ast-breadcrumbs-content":"","ast-featured-img":"","footer-sml-layout":"","theme-transparent-header-meta":"","adv-header-id-meta":"","stick-header-meta":"","header-above-stick-meta":"","header-main-stick-meta":"","header-below-stick-meta":"","astra-migrate-meta-layouts":"set","ast-page-background-enabled":"default","ast-page-background-meta":{"desktop":{"background-color":"var(--ast-global-color-4)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"ast-content-background-meta":{"desktop":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center 
center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"_jetpack_memberships_contains_paid_content":false,"footnotes":""},"categories":[443,442,43],"tags":[242,399,314],"class_list":["post-2624","post","type-post","status-publish","format-standard","hentry","category-llm","category-llms","category-infoarticle","tag-chatgpt","tag-jamba","tag-openai-api"],"views":2547,"jetpack_sharing_enabled":true,"jetpack_featured_media_url":"","_links":{"self":[{"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/posts\/2624","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=2624"}],"version-history":[{"count":28,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/posts\/2624\/revisions"}],"predecessor-version":[{"id":2660,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/posts
\/2624\/revisions\/2660"}],"wp:attachment":[{"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=2624"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=2624"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=2624"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}