{"id":2438,"date":"2024-03-11T03:09:48","date_gmt":"2024-03-10T19:09:48","guid":{"rendered":"https:\/\/www.aqwu.net\/wp\/?p=2438"},"modified":"2024-04-28T20:02:59","modified_gmt":"2024-04-28T12:02:59","slug":"%e4%ba%86%e8%a7%a3-llama-2-%e6%a8%a1%e5%9e%8b%e7%bb%93%e6%9e%843","status":"publish","type":"post","link":"https:\/\/www.aqwu.net\/wp\/?p=2438","title":{"rendered":"\u4e86\u89e3 LLaMA-2 \u6a21\u578b\u7ed3\u6784(3)"},"content":{"rendered":"\n<h2 class=\"wp-block-heading\"><strong>7. \u6a21\u578b\u8f6c\u6362<\/strong><\/h2>\n\n\n\n<p class=\"wp-block-paragraph\">\u628a\u6a21\u578b\u53c2\u6570\uff0c\u8f6c\u6362\u4e3a\u81ea\u5df1\u60f3\u8981\u7684\u6a21\u578b\u53c2\u6570\uff0c\u81ea\u5df1\u5b9a\u4e49\u6a21\u578b\u53c2\u6570\uff0c\u53c2\u7167 https:\/\/github.com\/karpathy\/llama2.c \u9879\u76ee\u4e0b\u7684model.py\u6587\u4ef6\uff0c\u547d\u540d\u4e3a model.py\uff0c\u6587\u4ef6\u4fdd\u5b58\u5230 newsrc \u76ee\u5f55\u4e0b\uff1a<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:python decode:true \">import math\nimport struct\nimport inspect\nfrom dataclasses import dataclass\nfrom typing import Any, Optional, Tuple\n\nimport numpy as np\nimport torch\nimport torch.nn.functional as F\nfrom torch import nn\n\n@dataclass\nclass ModelArgs:\n    # default hyperparameters for the Llama 7B model\n    dim: int = 4096\n    n_layers: int = 32\n    n_heads: int = 32\n    n_kv_heads: Optional[int] = None\n    vocab_size: int = 32000\n    hidden_dim: Optional[int] = None\n    multiple_of: int = 256  # MLP hidden layer size will be multiple of\n    norm_eps: float = 1e-5\n    max_seq_len: int = 2048\n    dropout: float = 0.0\n\n\nclass RMSNorm(torch.nn.Module):\n    def __init__(self, dim: int, eps: float):\n        super().__init__()\n        self.eps = eps\n        self.weight = nn.Parameter(torch.ones(dim))\n\n    def _norm(self, x):\n        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)\n\n    def forward(self, x):\n        output = self._norm(x.float()).type_as(x)\n        return output * self.weight\n\n\ndef precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):\n    freqs = 1.0 \/ (theta ** (torch.arange(0, dim, 2)[: (dim \/\/ 2)].float() \/ dim))\n    t = torch.arange(end, device=freqs.device)  # type: ignore\n    freqs = torch.outer(t, freqs).float()  # type: ignore\n    freqs_cos = torch.cos(freqs)  # real part\n    freqs_sin = torch.sin(freqs)  # imaginary part\n    return freqs_cos, freqs_sin\n\ndef reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):\n    ndim = x.ndim\n    assert 0 &lt;= 1 &lt; ndim\n    assert freqs_cis.shape == (x.shape[1], x.shape[-1])\n    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]\n    return freqs_cis.view(shape)\n\ndef apply_rotary_emb(\n    xq: torch.Tensor,\n    xk: torch.Tensor,\n    freqs_cos: torch.Tensor,\n    freqs_sin: torch.Tensor\n) -&gt; Tuple[torch.Tensor, torch.Tensor]:\n\n    # reshape xq and xk to match the complex representation\n    xq_r, xq_i = xq.float().reshape(xq.shape[:-1] + (-1, 2)).unbind(-1)\n    xk_r, xk_i = xk.float().reshape(xk.shape[:-1] + (-1, 2)).unbind(-1)\n\n    # reshape freqs_cos and freqs_sin for broadcasting\n    freqs_cos = reshape_for_broadcast(freqs_cos, xq_r)\n    freqs_sin = reshape_for_broadcast(freqs_sin, xq_r)\n\n    # apply rotation using real numbers\n    xq_out_r = xq_r * freqs_cos - xq_i * freqs_sin\n    xq_out_i = xq_r * freqs_sin + xq_i * freqs_cos\n    xk_out_r = xk_r * freqs_cos - xk_i * freqs_sin\n    xk_out_i = xk_r * freqs_sin + xk_i * freqs_cos\n\n    # flatten last two dimensions\n    xq_out = torch.stack([xq_out_r, xq_out_i], dim=-1).flatten(3)\n    xk_out = torch.stack([xk_out_r, xk_out_i], dim=-1).flatten(3)\n\n    return xq_out.type_as(xq), xk_out.type_as(xk)\n\ndef repeat_kv(x: torch.Tensor, n_rep: int) -&gt; torch.Tensor:\n    \"\"\"torch.repeat_interleave(x, dim=2, repeats=n_rep)\"\"\"\n    bs, slen, n_kv_heads, head_dim = x.shape\n    if n_rep == 1:\n        return x\n    return (\n        x[:, :, :, None, :]\n        .expand(bs, slen, n_kv_heads, n_rep, head_dim)\n        .reshape(bs, slen, n_kv_heads * n_rep, head_dim)\n    )\n\nclass Attention(nn.Module):\n    def __init__(self, args: ModelArgs):\n        super().__init__()\n        self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads\n        assert args.n_heads % self.n_kv_heads == 0\n        model_parallel_size = 1\n        self.n_local_heads = args.n_heads \/\/ model_parallel_size\n        self.n_local_kv_heads = self.n_kv_heads \/\/ model_parallel_size\n        self.n_rep = self.n_local_heads \/\/ self.n_local_kv_heads\n        self.head_dim = args.dim \/\/ args.n_heads\n        self.wq = nn.Linear(args.dim, args.n_heads * self.head_dim, bias=False)\n        self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)\n        self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)\n        self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False)\n        self.attn_dropout = nn.Dropout(args.dropout)\n        self.resid_dropout = nn.Dropout(args.dropout)\n        self.dropout = args.dropout\n\n        # use flash attention or a manual implementation?\n        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')\n        if not self.flash:\n            print(\"WARNING: using slow attention. Flash Attention requires PyTorch &gt;= 2.0\")\n            mask = torch.full((1, 1, args.max_seq_len, args.max_seq_len), float(\"-inf\"))\n            mask = torch.triu(mask, diagonal=1)\n            self.register_buffer(\"mask\", mask)\n\n    def forward(\n        self,\n        x: torch.Tensor,\n        freqs_cos: torch.Tensor,\n        freqs_sin: torch.Tensor,\n    ):\n        bsz, seqlen, _ = x.shape\n\n        # QKV\n        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)\n        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)\n        xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)\n        xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)\n\n        # RoPE relative positional embeddings\n        xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin)\n\n        # grouped multiquery attention: expand out keys and values\n        xk = repeat_kv(xk, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)\n        xv = repeat_kv(xv, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)\n\n        # make heads into a batch dimension\n        xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)\n        xk = xk.transpose(1, 2)\n        xv = xv.transpose(1, 2)\n\n        # flash implementation\n        if self.flash:\n            output = torch.nn.functional.scaled_dot_product_attention(xq, xk, xv, attn_mask=None, dropout_p=self.dropout if self.training else 0.0, is_causal=True)\n        else:\n            # manual implementation\n            scores = torch.matmul(xq, xk.transpose(2, 3)) \/ math.sqrt(self.head_dim)\n            assert hasattr(self, 'mask')\n            scores = scores + self.mask[:, :, :seqlen, :seqlen]   # (bs, n_local_heads, seqlen, cache_len + seqlen)\n            scores = F.softmax(scores.float(), dim=-1).type_as(xq)\n            scores = self.attn_dropout(scores)\n            output = torch.matmul(scores, xv)  # (bs, n_local_heads, seqlen, head_dim)\n\n        # restore time as batch dimension and concat heads\n        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)\n\n        # final projection into the residual stream\n        output = self.wo(output)\n        output = self.resid_dropout(output)\n        return output\n\n\nclass FeedForward(nn.Module):\n    def __init__(self, dim: int, hidden_dim: int, multiple_of: int, dropout: float):\n        super().__init__()\n        if hidden_dim is None:\n            hidden_dim = 4 * dim\n            hidden_dim = int(2 * hidden_dim \/ 3)\n            hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) \/\/ multiple_of)\n        self.w1 = nn.Linear(dim, hidden_dim, bias=False)\n        self.w2 = nn.Linear(hidden_dim, dim, bias=False)\n        self.w3 = nn.Linear(dim, hidden_dim, bias=False)\n        self.dropout = nn.Dropout(dropout)\n\n    def forward(self, x):\n        return self.dropout(self.w2(F.silu(self.w1(x)) * self.w3(x)))\n\n\nclass TransformerBlock(nn.Module):\n    def __init__(self, layer_id: int, args: ModelArgs):\n        super().__init__()\n        self.n_heads = args.n_heads\n        self.dim = args.dim\n        self.head_dim = args.dim \/\/ args.n_heads\n        self.attention = Attention(args)\n        self.feed_forward = FeedForward(\n            dim=args.dim,\n            hidden_dim=args.hidden_dim,\n            multiple_of=args.multiple_of,\n            dropout=args.dropout,\n        )\n        self.layer_id = layer_id\n        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)\n        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)\n\n    def forward(self, x, freqs_cos, freqs_sin):\n        h = x + self.attention.forward(self.attention_norm(x), freqs_cos, freqs_sin)\n        out = h + self.feed_forward.forward(self.ffn_norm(h))\n        return out\n\n\nclass Transformer(nn.Module):\n    last_loss: Optional[torch.Tensor]\n\n    def __init__(self, params: ModelArgs):\n        super().__init__()\n        self.params = params\n        self.vocab_size = params.vocab_size\n        self.n_layers = params.n_layers\n\n        self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)\n        self.dropout = nn.Dropout(params.dropout)\n        self.layers = torch.nn.ModuleList()\n        for layer_id in range(params.n_layers):\n            self.layers.append(TransformerBlock(layer_id, params))\n        self.norm = RMSNorm(params.dim, eps=params.norm_eps)\n        self.output = nn.Linear(params.dim, params.vocab_size, bias=False)\n\n        # share the unembedding parameters with the embedding parameters\n        self.tok_embeddings.weight = self.output.weight # https:\/\/paperswithcode.com\/method\/weight-tying\n\n        # some useful precompute for the RoPE relative positional embeddings\n        freqs_cos, freqs_sin = precompute_freqs_cis(self.params.dim \/\/ self.params.n_heads, self.params.max_seq_len)\n        self.register_buffer(\"freqs_cos\", freqs_cos, persistent=False)\n        self.register_buffer(\"freqs_sin\", freqs_sin, persistent=False)\n\n        # init all weights\n        self.apply(self._init_weights)\n        # apply special scaled init to the residual projections, per GPT-2 paper\n        for pn, p in self.named_parameters():\n            if pn.endswith('w3.weight') or pn.endswith('wo.weight'):\n                torch.nn.init.normal_(p, mean=0.0, std=0.02\/math.sqrt(2 * params.n_layers))\n\n        # Initialize attribute for the loss of the last forward call. This will be set if the forward is called with a targets tensor.\n        self.last_loss = None\n\n    def _init_weights(self, module):\n        if isinstance(module, nn.Linear):\n            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)\n            if module.bias is not None:\n                torch.nn.init.zeros_(module.bias)\n        elif isinstance(module, nn.Embedding):\n            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)\n\n    def forward(self, tokens: torch.Tensor, targets: Optional[torch.Tensor] = None) -&gt; torch.Tensor:\n        _bsz, seqlen = tokens.shape\n        h = self.tok_embeddings(tokens)\n        h = self.dropout(h)\n        freqs_cos = self.freqs_cos[:seqlen]\n        freqs_sin = self.freqs_sin[:seqlen]\n\n        for layer in self.layers:\n            h = layer(h, freqs_cos, freqs_sin)\n        h = self.norm(h)\n\n        if targets is not None:\n            # if we are given some desired targets also calculate the loss\n            logits = self.output(h)\n            self.last_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)\n        else:\n            # inference-time mini-optimization: only forward the output on the very last position\n            logits = self.output(h[:, [-1], :]) # note: using list [-1] to preserve the time dim\n            self.last_loss = None\n\n        return logits\n\n    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):\n        # start with all of the candidate parameters\n        param_dict = {pn: p for pn, p in self.named_parameters()}\n        # filter out those that do not require grad\n        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}\n        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.\n        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.\n        decay_params = [p for n, p in param_dict.items() if p.dim() &gt;= 2]\n        nodecay_params = [p for n, p in param_dict.items() if p.dim() &lt; 2]\n        optim_groups = [\n            {'params': decay_params, 'weight_decay': weight_decay},\n            {'params': nodecay_params, 'weight_decay': 0.0}\n        ]\n        num_decay_params = sum(p.numel() for p in decay_params)\n        num_nodecay_params = sum(p.numel() for p in nodecay_params)\n        print(f\"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters\")\n        print(f\"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters\")\n        # Create AdamW optimizer and use the fused version if it is available\n        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters\n        use_fused = fused_available and device_type == 'cuda'\n        extra_args = dict(fused=True) if use_fused else dict()\n        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)\n        print(f\"using fused AdamW: {use_fused}\")\n\n        return optimizer\n\n    def estimate_mfu(self, fwdbwd_per_iter, dt):\n        \"\"\" estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS \"\"\"\n        # first estimate the number of flops we do per iteration.\n        # see PaLM paper Appendix B as ref: https:\/\/arxiv.org\/abs\/2204.02311\n        N = sum(p.numel() for p in self.parameters())\n        cfg = self.params\n        L, H, Q, T = cfg.n_layers, cfg.n_heads, cfg.dim\/\/cfg.n_heads, cfg.max_seq_len\n        flops_per_token = 6*N + 12*L*H*Q*T\n        flops_per_fwdbwd = flops_per_token * T\n        flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter\n        # express our flops throughput as ratio of A100 bfloat16 peak flops\n        flops_achieved = flops_per_iter * (1.0\/dt) # per second\n        flops_promised = 312e12 # A100 GPU bfloat16 peak flops is 312 TFLOPS\n        mfu = flops_achieved \/ flops_promised\n        return mfu\n\n    @torch.inference_mode()\n    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):\n        \"\"\"\n        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete\n        the sequence max_new_tokens times, feeding the predictions back into the model each time.\n        Most likely you'll want to make sure to be in model.eval() mode of operation for this.\n        Also note this is a super inefficient version of sampling with no key\/value cache.\n        \"\"\"\n        for _ in range(max_new_tokens):\n            # if the sequence context is growing too long we must crop it at block_size\n            idx_cond = idx if idx.size(1) &lt;= self.params.max_seq_len else idx[:, -self.params.max_seq_len:]\n            # forward the model to get the logits for the index in the sequence\n            logits = self(idx_cond)\n            logits = logits[:, -1, :] # crop to just the final time step\n            if temperature == 0.0:\n                # \"sample\" the single most likely index\n                _, idx_next = torch.topk(logits, k=1, dim=-1)\n            else:\n                # pluck the logits at the final step and scale by desired temperature\n                logits = logits \/ temperature\n                # optionally crop the logits to only the top k options\n                if top_k is not None:\n                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))\n                    logits[logits &lt; v[:, [-1]]] = -float('Inf')\n                # apply softmax to convert logits to (normalized) probabilities\n                probs = F.softmax(logits, dim=-1)\n                idx_next = torch.multinomial(probs, num_samples=1)\n            # append sampled index to the running sequence and continue\n            idx = torch.cat((idx, idx_next), dim=1)\n\n        return idx\n<\/pre><\/div>\n\n\n\n<p class=\"wp-block-paragraph\">\u4e0b\u9762\u662f\u8c03\u7528\u4ee3\u7801\uff0c\u53c2\u7167 https:\/\/github.com\/karpathy\/llama2.c \u9879\u76ee\u4e0b\u7684 export.py \u6587\u4ef6\uff0c\u547d\u540d\u4e3atest07.py\uff0c\u6587\u4ef6\u4fdd\u5b58\u5230 newsrc \u76ee\u5f55\u4e0b\uff1a<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:python decode:true \">from transformers import AutoModelForCausalLM\nfrom model import ModelArgs, Transformer\n\nimport numpy as np\nimport torch\nfrom torch import nn\n\ndef load_hf_model(model_path):\n\n    # load HF model\n    hf_model = AutoModelForCausalLM.from_pretrained(model_path)\n    hf_dict = hf_model.state_dict()\n\n    # convert LlamaConfig to ModelArgs\n    config = ModelArgs()\n    config.dim = hf_model.config.hidden_size\n    config.n_layers = hf_model.config.num_hidden_layers\n    config.n_heads = hf_model.config.num_attention_heads\n    config.n_kv_heads = hf_model.config.num_attention_heads\n    config.vocab_size = hf_model.config.vocab_size\n    config.hidden_dim = hf_model.config.intermediate_size\n    config.norm_eps = hf_model.config.rms_norm_eps\n    config.max_seq_len = hf_model.config.max_position_embeddings\n\n    # create a new Transformer object and set weights\n    model = Transformer(config)\n\n    model.tok_embeddings.weight = nn.Parameter(hf_dict['model.embed_tokens.weight'])\n    model.norm.weight = nn.Parameter(hf_dict['model.norm.weight'])\n\n    # huggingface permutes WQ and WK, this function reverses it\n    def permute_reverse(w, n_heads=config.n_heads, dim1=config.dim, dim2=config.dim):\n        return w.view(n_heads, 2, dim1 \/\/ n_heads \/\/ 2, dim2).transpose(1, 2).reshape(dim1, dim2)\n\n    for layer in model.layers:\n        i = layer.layer_id\n        layer.attention_norm.weight = nn.Parameter(hf_dict[f'model.layers.{i}.input_layernorm.weight'])\n        layer.attention.wq.weight = nn.Parameter(permute_reverse(hf_dict[f'model.layers.{i}.self_attn.q_proj.weight']))\n        layer.attention.wk.weight = nn.Parameter(permute_reverse(hf_dict[f'model.layers.{i}.self_attn.k_proj.weight']))\n        layer.attention.wv.weight = nn.Parameter(hf_dict[f'model.layers.{i}.self_attn.v_proj.weight'])\n        layer.attention.wo.weight = nn.Parameter(hf_dict[f'model.layers.{i}.self_attn.o_proj.weight'])\n        layer.ffn_norm.weight = nn.Parameter(hf_dict[f'model.layers.{i}.post_attention_layernorm.weight'])\n        layer.feed_forward.w1.weight = nn.Parameter(hf_dict[f'model.layers.{i}.mlp.gate_proj.weight'])\n        layer.feed_forward.w2.weight = nn.Parameter(hf_dict[f'model.layers.{i}.mlp.down_proj.weight'])\n        layer.feed_forward.w3.weight = nn.Parameter(hf_dict[f'model.layers.{i}.mlp.up_proj.weight'])\n\n    # final classifier\n    model.output.weight = nn.Parameter(hf_dict['lm_head.weight'])\n    model.eval()\n    return model\n\n# \u6307\u5b9a\u6a21\u578b\u8def\u5f84\nmodel_path = \"meta-llama\/Llama-2-7b-chat-hf\"\n\nmodel = load_hf_model(model_path)\nprint(model)\n\nfor name, param in model.named_parameters():\n    print(f\"{name}: {param.size()}\")\n<\/pre><\/div>\n\n\n\n<p class=\"wp-block-paragraph\">\u8fd0\u884c test07.py<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:sh decode:true \" >python newsrc\/test07.py\nLoading checkpoint shards: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 2\/2 [01:25&lt;00:00, 42.94s\/it]\nTransformer(\n  (tok_embeddings): Embedding(32000, 4096)\n  (dropout): Dropout(p=0.0, inplace=False)\n  (layers): ModuleList(\n    (0-31): 32 x TransformerBlock(\n      (attention): Attention(\n        (wq): Linear(in_features=4096, out_features=4096, bias=False)\n        (wk): Linear(in_features=4096, out_features=4096, bias=False)\n        (wv): Linear(in_features=4096, out_features=4096, bias=False)\n        (wo): Linear(in_features=4096, out_features=4096, bias=False)\n        (attn_dropout): Dropout(p=0.0, inplace=False)\n        (resid_dropout): Dropout(p=0.0, inplace=False)\n      )\n      (feed_forward): FeedForward(\n        (w1): Linear(in_features=4096, out_features=11008, bias=False)\n        (w2): Linear(in_features=11008, out_features=4096, bias=False)\n        (w3): Linear(in_features=4096, out_features=11008, bias=False)\n        (dropout): Dropout(p=0.0, inplace=False)\n      )\n      (attention_norm): RMSNorm()\n      (ffn_norm): RMSNorm()\n    )\n  )\n  (norm): RMSNorm()\n  (output): Linear(in_features=4096, out_features=32000, bias=False)\n)\ntok_embeddings.weight: torch.Size([32000, 4096])\nlayers.0.attention.wq.weight: torch.Size([4096, 4096])\nlayers.0.attention.wk.weight: torch.Size([4096, 4096])\nlayers.0.attention.wv.weight: torch.Size([4096, 4096])\nlayers.0.attention.wo.weight: torch.Size([4096, 4096])\nlayers.0.feed_forward.w1.weight: torch.Size([11008, 4096])\nlayers.0.feed_forward.w2.weight: torch.Size([4096, 11008])\nlayers.0.feed_forward.w3.weight: torch.Size([11008, 4096])\nlayers.0.attention_norm.weight: torch.Size([4096])\nlayers.0.ffn_norm.weight: torch.Size([4096])\n...\nlayers.31.attention.wq.weight: torch.Size([4096, 4096])\nlayers.31.attention.wk.weight: torch.Size([4096, 4096])\nlayers.31.attention.wv.weight: torch.Size([4096, 4096])\nlayers.31.attention.wo.weight: torch.Size([4096, 4096])\nlayers.31.feed_forward.w1.weight: torch.Size([11008, 4096])\nlayers.31.feed_forward.w2.weight: torch.Size([4096, 11008])\nlayers.31.feed_forward.w3.weight: torch.Size([11008, 4096])\nlayers.31.attention_norm.weight: torch.Size([4096])\nlayers.31.ffn_norm.weight: torch.Size([4096])\nnorm.weight: torch.Size([4096])\noutput.weight: torch.Size([32000, 4096])<\/pre><\/div>\n","protected":false},"excerpt":{"rendered":"<p>7. \u6a21\u578b\u8f6c\u6362 \u628a\u6a21\u578b\u53c2\u6570\uff0c\u8f6c\u6362\u4e3a\u81ea\u5df1\u60f3\u8981\u7684\u6a21\u578b\u53c2\u6570\uff0c\u81ea\u5df1\u5b9a\u4e49\u6a21\u578b\u53c2\u6570\uff0c\u53c2\u7167 https:\/\/github.c [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"site-sidebar-layout":"default","site-content-layout":"","ast-site-content-layout":"default","site-content-style":"default","site-sidebar-style":"default","ast-global-header-display":"","ast-banner-title-visibility":"","ast-main-header-display":"","ast-hfb-above-header-display":"","ast-hfb-below-header-display":"","ast-hfb-mobile-header-display":"","site-post-title":"","ast-breadcrumbs-content":"","ast-featured-img":"","footer-sml-layout":"","theme-transparent-header-meta":"","adv-header-id-meta":"","stick-header-meta":"","header-above-stick-meta":"","header-main-stick-meta":"","header-below-stick-meta":"","astra-migrate-meta-layouts":"set","ast-page-background-enabled":"default","ast-page-background-meta":{"desktop":{"background-color":"var(--ast-global-color-4)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"ast-content-background-meta":{"desktop":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"_jetpack_memberships_contains_paid_content":false,"footnotes":""},"categories":[313,289,443,442,312],"tags":[242,314],"class_list":["post-2438","post","type-post","status-publish","format-standard","hentry","category-chatgpt","category-gpt","category-llm","category-llms","category-openai","tag-chatgpt","tag-openai-api"],"views":1837,"jetpack_sharing_enabled":true,"jetpack_featured_media_url":"","_links":{"self":[{"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/posts\/2438","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=2438"}],"version-history":[{"count":7,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/posts\/2438\/revisions"}],"predecessor-version":[{"id":2454,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/posts\/2438\/revisions\/2454"}],"wp:attachment":[{"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=2438"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=2438"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=2438"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}