测试环境:Windows、llama.cpp、CUDA。运行命令:
```
main.exe -t 16 -ngl 256 -m models\grok-1-q2_k_s-00001-of-00009.gguf --color -c 4096 --temp 0.9 --repeat_penalty 1.1 -n -1 -p "### Instruction: Write me a linked list implementation in C/C++, starting with '```' and ending with '```'\n### Response:"
```
main.exe -t 16 -ngl 256 -m models\grok-1-q2_k_s-00001-of-00009.gguf --color -c 4096 --temp 0.9 --repeat_penalty 1.1 -n -1 -p "### Instruction: Write me a linked list implementation in C/C++, starting with '```' and ending with '```'\n### Response:" Log start main: build = 2555 (d0e2f641) main: built with MSVC 19.39.33523.0 for x64 main: seed = 1711616424 llama_model_loader: additional 8 GGUFs metadata loaded. llama_model_loader: loaded meta data with 27 key-value pairs and 2114 tensors from c:\ai\grok\grok-1\models\grok-1-q2_k_s-00001-of-00009.gguf (version GGUF V3 (latest)) llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. llama_model_loader: - kv 0: general.architecture str = grok llama_model_loader: - kv 1: general.name str = Grok llama_model_loader: - kv 2: grok.block_count u32 = 64 llama_model_loader: - kv 3: grok.context_length u32 = 4096 llama_model_loader: - kv 4: grok.embedding_length u32 = 6144 llama_model_loader: - kv 5: grok.feed_forward_length u32 = 32768 llama_model_loader: - kv 6: grok.attention.head_count u32 = 48 llama_model_loader: - kv 7: grok.attention.head_count_kv u32 = 8 llama_model_loader: - kv 8: grok.rope.freq_base f32 = 100000.000000 llama_model_loader: - kv 9: grok.attention.layer_norm_rms_epsilon f32 = 0.000010 llama_model_loader: - kv 10: grok.expert_count u32 = 8 llama_model_loader: - kv 11: grok.expert_used_count u32 = 2 llama_model_loader: - kv 12: general.file_type u32 = 10 llama_model_loader: - kv 13: tokenizer.ggml.model str = llama llama_model_loader: - kv 14: tokenizer.ggml.tokens arr[str,131072] = ["[PAD]", "[BOS]", "[EOS]", "[UNK]", ... llama_model_loader: - kv 15: tokenizer.ggml.scores arr[f32,131072] = [0.000000, 0.000000, 0.000000, 0.0000... llama_model_loader: - kv 16: tokenizer.ggml.token_type arr[i32,131072] = [3, 3, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, ... 
llama_model_loader: - kv 17: tokenizer.ggml.bos_token_id u32 = 1 llama_model_loader: - kv 18: tokenizer.ggml.eos_token_id u32 = 2 llama_model_loader: - kv 19: tokenizer.ggml.unknown_token_id u32 = 3 llama_model_loader: - kv 20: tokenizer.ggml.padding_token_id u32 = 0 llama_model_loader: - kv 21: tokenizer.ggml.add_bos_token bool = false llama_model_loader: - kv 22: tokenizer.ggml.add_eos_token bool = false llama_model_loader: - kv 23: general.quantization_version u32 = 2 llama_model_loader: - kv 24: split.no u16 = 0 llama_model_loader: - kv 25: split.count u16 = 9 llama_model_loader: - kv 26: split.tensors.count i32 = 2114 llama_model_loader: - type f32: 257 tensors llama_model_loader: - type f16: 64 tensors llama_model_loader: - type q8_0: 128 tensors llama_model_loader: - type q2_K: 1088 tensors llama_model_loader: - type q3_K: 512 tensors llama_model_loader: - type q5_K: 64 tensors llama_model_loader: - type q6_K: 1 tensors llm_load_vocab: mismatch in special tokens definition ( 284/131072 vs 260/131072 ). 
llm_load_print_meta: format = GGUF V3 (latest) llm_load_print_meta: arch = grok llm_load_print_meta: vocab type = SPM llm_load_print_meta: n_vocab = 131072 llm_load_print_meta: n_merges = 0 llm_load_print_meta: n_ctx_train = 4096 llm_load_print_meta: n_embd = 6144 llm_load_print_meta: n_head = 48 llm_load_print_meta: n_head_kv = 8 llm_load_print_meta: n_layer = 64 llm_load_print_meta: n_rot = 128 llm_load_print_meta: n_embd_head_k = 128 llm_load_print_meta: n_embd_head_v = 128 llm_load_print_meta: n_gqa = 6 llm_load_print_meta: n_embd_k_gqa = 1024 llm_load_print_meta: n_embd_v_gqa = 1024 llm_load_print_meta: f_norm_eps = 0.0e+00 llm_load_print_meta: f_norm_rms_eps = 1.0e-05 llm_load_print_meta: f_clamp_kqv = 0.0e+00 llm_load_print_meta: f_max_alibi_bias = 0.0e+00 llm_load_print_meta: f_logit_scale = 0.0e+00 llm_load_print_meta: n_ff = 32768 llm_load_print_meta: n_expert = 8 llm_load_print_meta: n_expert_used = 2 llm_load_print_meta: causal attn = 1 llm_load_print_meta: pooling type = 0 llm_load_print_meta: rope type = 2 llm_load_print_meta: rope scaling = linear llm_load_print_meta: freq_base_train = 100000.0 llm_load_print_meta: freq_scale_train = 1 llm_load_print_meta: n_yarn_orig_ctx = 4096 llm_load_print_meta: rope_finetuned = unknown llm_load_print_meta: ssm_d_conv = 0 llm_load_print_meta: ssm_d_inner = 0 llm_load_print_meta: ssm_d_state = 0 llm_load_print_meta: ssm_dt_rank = 0 llm_load_print_meta: model type = 314B llm_load_print_meta: model ftype = Q2_K - Medium llm_load_print_meta: model params = 315.68 B llm_load_print_meta: model size = 107.96 GiB (2.94 BPW) llm_load_print_meta: general.name = Grok llm_load_print_meta: BOS token = 1 '[BOS]' llm_load_print_meta: EOS token = 2 '[EOS]' llm_load_print_meta: UNK token = 3 '[UNK]' llm_load_print_meta: PAD token = 0 '[PAD]' llm_load_print_meta: LF token = 79 '<0x0A>' ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes ggml_cuda_init: found 8 CUDA devices: Device 0: NVIDIA GeForce 
RTX 4090, compute capability 8.9, VMM: yes Device 1: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes Device 2: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes Device 3: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes Device 4: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes Device 5: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes Device 6: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes Device 7: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes llm_load_tensors: ggml ctx size = 7.26 MiB llm_load_tensors: offloading 64 repeating layers to GPU llm_load_tensors: offloading non-repeating layers to GPU llm_load_tensors: offloaded 65/65 layers to GPU llm_load_tensors: CPU buffer size = 630.00 MiB llm_load_tensors: CUDA0 buffer size = 15457.50 MiB llm_load_tensors: CUDA1 buffer size = 13740.00 MiB llm_load_tensors: CUDA2 buffer size = 13740.00 MiB llm_load_tensors: CUDA3 buffer size = 13740.00 MiB llm_load_tensors: CUDA4 buffer size = 13740.00 MiB llm_load_tensors: CUDA5 buffer size = 13740.00 MiB llm_load_tensors: CUDA6 buffer size = 13740.00 MiB llm_load_tensors: CUDA7 buffer size = 12652.52 MiB .................................................................................................... 
llama_new_context_with_model: n_ctx = 4096 llama_new_context_with_model: n_batch = 2048 llama_new_context_with_model: n_ubatch = 512 llama_new_context_with_model: freq_base = 100000.0 llama_new_context_with_model: freq_scale = 1 llama_kv_cache_init: CUDA0 KV buffer size = 144.00 MiB llama_kv_cache_init: CUDA1 KV buffer size = 128.00 MiB llama_kv_cache_init: CUDA2 KV buffer size = 128.00 MiB llama_kv_cache_init: CUDA3 KV buffer size = 128.00 MiB llama_kv_cache_init: CUDA4 KV buffer size = 128.00 MiB llama_kv_cache_init: CUDA5 KV buffer size = 128.00 MiB llama_kv_cache_init: CUDA6 KV buffer size = 128.00 MiB llama_kv_cache_init: CUDA7 KV buffer size = 112.00 MiB llama_new_context_with_model: KV self size = 1024.00 MiB, K (f16): 512.00 MiB, V (f16): 512.00 MiB llama_new_context_with_model: CUDA_Host output buffer size = 0.50 MiB llama_new_context_with_model: pipeline parallelism enabled (n_copies=4) llama_new_context_with_model: CUDA0 compute buffer size = 564.04 MiB llama_new_context_with_model: CUDA1 compute buffer size = 564.04 MiB llama_new_context_with_model: CUDA2 compute buffer size = 564.04 MiB llama_new_context_with_model: CUDA3 compute buffer size = 564.04 MiB llama_new_context_with_model: CUDA4 compute buffer size = 564.04 MiB llama_new_context_with_model: CUDA5 compute buffer size = 564.04 MiB llama_new_context_with_model: CUDA6 compute buffer size = 564.04 MiB llama_new_context_with_model: CUDA7 compute buffer size = 564.05 MiB llama_new_context_with_model: CUDA_Host compute buffer size = 44.02 MiB llama_new_context_with_model: graph nodes = 3784 llama_new_context_with_model: graph splits = 9 system_info: n_threads = 16 / 64 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | MATMUL_INT8 = 0 | sampling: repeat_last_n = 64, repeat_penalty = 1.100, frequency_penalty = 0.000, presence_penalty = 0.000 top_k 
= 40, tfs_z = 1.000, top_p = 0.950, min_p = 0.050, typical_p = 1.000, temp = 0.900 mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 sampling order: CFG -> Penalties -> top_k -> tfs_z -> typical_p -> top_p -> min_p -> temperature generate: n_ctx = 4096, n_batch = 2048, n_predict = -1, n_keep = 0 ### Instruction: Write me a linked list implementation in C/C++, starting with '```' and ending with '```'\n### Response: <details><summary>Click to expand</summary> <p> #include <stdio.h> #include <stdlib.h> struct node { int data; struct node *next; }; void push(struct node **head, int value) { struct node *node = malloc(sizeof(struct node)); node->data = value; node->next = *head; *head = node; } int main() { // Create a new head pointer for the list. struct node **list = (struct node **) calloc(1, sizeof(struct node *)); push(&(*list), 0); // Insert a value into the list. for (int i = 1; i < 8; i++) { // Create new nodes for each element in array and add to linked-list. if (!i) continue; push(list, i); } struct node *current = *list; while (current->next) { printf("%d ", current->data); current = (*list)->next; } printf("\n"); // 1234567 } </p> </details> [end of text] llama_print_timings: load time = 142096.82 ms llama_print_timings: sample time = 174.00 ms / 281 runs ( 0.62 ms per token, 1614.90 tokens per second) llama_print_timings: prompt eval time = 2152.91 ms / 29 tokens ( 74.24 ms per token, 13.47 tokens per second) llama_print_timings: eval time = 257631.30 ms / 280 runs ( 920.11 ms per token, 1.09 tokens per second) llama_print_timings: total time = 260199.48 ms / 309 tokens Log end |