{"id":4546,"date":"2024-08-21T22:06:52","date_gmt":"2024-08-21T14:06:52","guid":{"rendered":"https:\/\/www.aqwu.net\/wp\/?p=4546"},"modified":"2024-08-21T23:40:17","modified_gmt":"2024-08-21T15:40:17","slug":"%e4%bd%bf%e7%94%a8-lora-%e5%be%ae%e8%b0%83%e4%b8%80%e4%b8%aa-pdf-%e4%bd%bf%e7%94%a8%e6%89%8b%e5%86%8c%ef%bc%882%ef%bc%89","status":"publish","type":"post","link":"https:\/\/www.aqwu.net\/wp\/?p=4546","title":{"rendered":"\u4f7f\u7528 loRA \u5fae\u8c03\u4e00\u4e2a PDF \u4f7f\u7528\u624b\u518c\uff082\uff09"},"content":{"rendered":"\n<p>\u4e0a\u4e00\u8282\u8bb2\u5230 <strong><a href=\"https:\/\/www.aqwu.net\/wp\/?p=4541\">\u628a pdf \u6587\u4ef6\u8f6c\u6362\u4e3a md \u6587\u4ef6<\/a><\/strong><\/p>\n\n\n\n<p>\u8fd9\u4e00\u8282\u4ecb\u7ecd\u628a md \u6587\u4ef6\u8f6c\u6362\u4e3a jsonl \u6587\u4ef6\uff0c\u4fbf\u4e8e\u5fae\u8c03\u4f7f\u7528<\/p>\n\n\n\n<p>pdf \u6587\u4ef6\u8f6c\u6362 md \u6587\u4ef6\u540e\uff0c\u5927\u6982\u7684\u683c\u5f0f\u5982\u4e0b\uff1a<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:python decode:true \" >\u8fd9\u662f\u6587\u6863\u7684\u7b2c\u4e00\u53e5\u8bdd\uff0c\u6ca1\u6709\u4efb\u4f55\u6807\u9898\u6807\u6ce8\u3002\n# \u7b2c\u4e00\u90e8\u5206\n\u8fd9\u91cc\u662f\u7b2c\u4e00\u90e8\u5206\u7684\u5185\u5bb9\u3002\n# \u7b2c\u4e8c\u90e8\u5206\n\u8fd9\u91cc\u662f\u7b2c\u4e8c\u90e8\u5206\u7684\u5185\u5bb9\u3002\n<\/pre><\/div>\n\n\n\n<p>\u4e3a\u4e86\u4fdd\u8bc1\u6570\u636e\u7684\u6709\u6548\u6027\uff0c\u63d0\u4f9b\u7684\u6587\u6863\u9700\u8981\u8fdb\u884c\u7b2c\u4e8c\u6b21\u5904\u7406<\/p>\n\n\n\n<p>\u6211\u4eec\u5fae\u8c03\u7684\u6587\u6863\u5927\u90e8\u5206\u662f\u82f1\u6587\u7684\uff0c\u5c3d\u91cf\u4fdd\u6301\u539f\u7406\u82f1\u6587\u7684\u903b\u8f91\uff0c\u4f46\u662f\u9700\u8981\u5206\u6bb5\uff0c\u5206\u6bb5\u4e5f\u662f\u4e00\u4e2a\u6280\u5de7\u3002<\/p>\n\n\n\n<p>\u6bcf\u4e00\u6bb5\u9700\u8981\u6700\u591a\u591a\u5c11\u5b57\u8282\uff0c\u5982\u4f55\u5206\u6bb5\uff1f<\/p>\n\n\n\n<p>\u8fd9\u91cc\u662f\u8fd9\u6837\u5206\u6bb5\uff0c\u6309 md \u7684\u5206\u6bb5\u903b\u8f91\u6765\u3002<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li>\u4ee5 # \u5f00\u59cb\u4e3a\u4e00\u6bb5\uff0c\u4f46\u662f\u8981\u4fdd\u6301\u5b57\u8282\u8db3\u591f\u591a\uff0c\u53ef\u4ee5\u7ee7\u7eed\u4e0b\u4e00\u4e2a #\uff0c <\/li>\n\n\n\n<li>\u4f46\u662f\u603b\u5b57\u8282\u6570\u4e0d\u80fd\u8d85\u8fc7 2048\u4e2a token\uff08\u4e0d\u662f\u5b57\u8282\uff0c\u8fd9\u4e2a\u4f60\u53ef\u4ee5\u4fee\u6539\uff09<\/li>\n\n\n\n<li>\u5982\u679c\u6bcf\u4e00\u6bb5\u8d85\u8fc72048\u4e2a\u5b57\u8282\uff0c\u5219\u4f1a\u6309\u884c\u518d\u5206\u5f00<\/li>\n\n\n\n<li>\u4f7f\u7528\u7684\u6a21\u578b\u662f <a href=\"https:\/\/huggingface.co\/Qwen\/Qwen2-7B-Instruct\">https:\/\/huggingface.co\/Qwen\/Qwen2-7B-Instruct<\/a><\/li>\n<\/ol>\n\n\n\n<p>\u5728\u5fae\u8c03\u5927\u578b\u8bed\u8a00\u6a21\u578b\uff08\u5982Qwen-2\uff09\u65f6\uff0c\u901a\u5e38\u4f1a\u6309<strong>token<\/strong>\u6765\u5904\u7406\uff0c\u800c\u4e0d\u662f\u5b57\u8282\u3002\u8fd9\u662f\u56e0\u4e3a\uff1a<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li><strong>Token\u662f\u6a21\u578b\u7684\u57fa\u672c\u5904\u7406\u5355\u5143<\/strong>\uff1a\u8bed\u8a00\u6a21\u578b\u5728\u8f93\u5165\u6587\u672c\u65f6\u4f1a\u5c06\u5176\u62c6\u5206\u4e3atoken\uff08\u901a\u5e38\u662f\u8bcd\u3001\u8bcd\u7684\u4e00\u90e8\u5206\u6216\u5b57\u7b26\uff09\u3002\u4e0d\u540c\u7684tokenizer\uff08\u5982BPE\u3001WordPiece\uff09\u6709\u4e0d\u540c\u7684\u65b9\u5f0f\u751f\u6210token\uff0c\u56e0\u6b64token\u6570\u76f4\u63a5\u51b3\u5b9a\u6a21\u578b\u7684\u8f93\u5165\u957f\u5ea6\u3002<\/li>\n\n\n\n<li><strong>Token\u4e0e\u5b57\u8282\u7684\u5173\u7cfb<\/strong>\uff1a\u867d\u7136\u5b57\u8282\u6570\u5728\u6587\u672c\u7f16\u7801\u548c\u5b58\u50a8\u4e2d\u6709\u610f\u4e49\uff0c\u4f46\u5728\u8bed\u8a00\u6a21\u578b\u4e2d\uff0c\u6a21\u578b\u5b9e\u9645\u5904\u7406\u7684\u662ftoken\u3002\u5373\u4f7f\u4e24\u4e2a\u6587\u672c\u7684\u5b57\u8282\u6570\u76f8\u540c\uff0ctoken\u6570\u53ef\u80fd\u4f1a\u4e0d\u540c\uff0c\u56e0\u4e3a\u540c\u4e00\u4e2a\u5b57\u7b26\u5e8f\u5217\u5728\u4e0d\u540c\u7684\u4e0a\u4e0b\u6587\u4e2d\u53ef\u80fd\u88ab\u5207\u5206\u6210\u4e0d\u540c\u6570\u91cf\u7684token\u3002<\/li>\n\n\n\n<li><strong>\u6700\u5927\u957f\u5ea6\u9650\u5236<\/strong>\uff1a\u5fae\u8c03\u8fc7\u7a0b\u4e2d\uff0c\u8f93\u5165\u5e8f\u5217\u7684\u957f\u5ea6\u9650\u5236\u662f\u57fa\u4e8etoken\u7684\u3002\u4f8b\u5982\uff0c\u5927\u591a\u6570\u6a21\u578b\u5728\u5904\u7406\u65f6\u4f1a\u6709\u6700\u5927token\u6570\uff08\u59822048\u62164096 tokens\uff09\u7684\u9650\u5236\uff0c\u800c\u4e0d\u662f\u76f4\u63a5\u6839\u636e\u5b57\u8282\u6570\u3002<\/li>\n<\/ol>\n\n\n\n<p>\u4e0b\u9762\u662f\u793a\u4f8b\u4ee3\u7801\uff1a<\/p>\n\n\n\n<div class=\"wp-block-urvanov-syntax-highlighter-code-block\"><pre class=\"lang:python decode:true \" >import re\nimport json\nfrom transformers import AutoTokenizer\n\n# \u521d\u59cb\u5316tokenizer\ntokenizer = AutoTokenizer.from_pretrained(\"Qwen\/Qwen2-7B-Instruct\")\n\ndef split_by_headers(md_content):\n    # \u5224\u65ad\u6587\u6863\u5f00\u5934\u662f\u5426\u6709\u6807\u9898\n    if not md_content.startswith(\"#\"):\n        first_part, *rest = re.split(r'(?=^#)', md_content, flags=re.MULTILINE)\n        sections = [first_part.strip()] + rest\n    else:\n        sections = re.split(r'(?=^#)', md_content, flags=re.MULTILINE)\n    \n    return [section.strip() for section in sections if section.strip()]\n\ndef split_long_section(section, max_length=2048):\n    lines = section.split(\"\\n\")\n    split_sections = []\n    current_chunk = \"\"\n\n    for line in lines:\n        temp_chunk = current_chunk + \"\\n\" + line if current_chunk else line\n        token_length = len(tokenizer.encode(temp_chunk, add_special_tokens=False))\n\n        if token_length &lt;= max_length:\n            current_chunk = temp_chunk\n        else:\n            split_sections.append(current_chunk)\n            current_chunk = line\n\n    if current_chunk:\n        split_sections.append(current_chunk)\n\n    return split_sections\n\ndef merge_sections(sections, max_length=2048):\n    merged_sections = []\n    current_section = sections[0]\n\n    for next_section in sections[1:]:\n        temp_section = current_section + \"\\n\" + next_section\n        token_length = len(tokenizer.encode(temp_section, add_special_tokens=False))\n\n        if token_length &lt;= max_length:\n            current_section = temp_section\n        else:\n            # \u5982\u679c\u5408\u5e76\u540e\u7684\u6bb5\u843d\u8d85\u8fc7\u9650\u5236\uff0c\u5148\u5bf9\u5f53\u524d\u6bb5\u843d\u8fdb\u884c\u5904\u7406\n            if len(tokenizer.encode(current_section, add_special_tokens=False)) &gt; max_length:\n                merged_sections.extend(split_long_section(current_section, max_length))\n            else:\n                merged_sections.append(current_section)\n            current_section = next_section\n\n    # \u6700\u540e\u5904\u7406\u5269\u4f59\u6bb5\u843d\n    if len(tokenizer.encode(current_section, add_special_tokens=False)) &gt; max_length:\n        merged_sections.extend(split_long_section(current_section, max_length))\n    else:\n        merged_sections.append(current_section)\n\n    return merged_sections\n\ndef process_markdown_file(md_file, max_length=2048):\n    with open(md_file, 'r', encoding='utf-8') as file:\n        md_content = file.read()\n\n    sections = split_by_headers(md_content)\n    merged_sections = merge_sections(sections, max_length)\n    \n    return merged_sections\n\ndef save_to_jsonl(sections, output_file):\n    with open(output_file, 'w', encoding='utf-8') as f:\n        for section in sections:\n            token_length = len(tokenizer.encode(section, add_special_tokens=False))\n            byte_length = len(section.encode('utf-8'))\n            json_line = json.dumps({\n                \"token_length\": token_length,\n                \"byte_length\": byte_length,\n                \"content\": section\n            }, ensure_ascii=False)\n            f.write(json_line + \"\\n\")\n\n# \u4f7f\u7528\u793a\u4f8b\nsplit_sections = process_markdown_file(\"output\/EX_SAG_10x\/EX_SAG_10x.md\", max_length=2048)\nsave_to_jsonl(split_sections, \"EX_SAG_10x.jsonl\")\n<\/pre><\/div>\n","protected":false},"excerpt":{"rendered":"<p>\u4e0a\u4e00\u8282\u8bb2\u5230 \u628a pdf \u6587\u4ef6\u8f6c\u6362\u4e3a md \u6587\u4ef6 \u8fd9\u4e00\u8282\u4ecb\u7ecd\u628a md \u6587\u4ef6\u8f6c\u6362\u4e3a jsonl \u6587\u4ef6\uff0c\u4fbf\u4e8e\u5fae\u8c03\u4f7f\u7528 [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"site-sidebar-layout":"default","site-content-layout":"","ast-site-content-layout":"default","site-content-style":"default","site-sidebar-style":"default","ast-global-header-display":"","ast-banner-title-visibility":"","ast-main-header-display":"","ast-hfb-above-header-display":"","ast-hfb-below-header-display":"","ast-hfb-mobile-header-display":"","site-post-title":"","ast-breadcrumbs-content":"","ast-featured-img":"","footer-sml-layout":"","theme-transparent-header-meta":"","adv-header-id-meta":"","stick-header-meta":"","header-above-stick-meta":"","header-main-stick-meta":"","header-below-stick-meta":"","astra-migrate-meta-layouts":"set","ast-page-background-enabled":"default","ast-page-background-meta":{"desktop":{"background-color":"var(--ast-global-color-4)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"ast-content-background-meta":{"desktop":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"_jetpack_memberships_contains_paid_content":false,"footnotes":""},"categories":[444,443,442],"tags":[395,450],"class_list":["post-4546","post","type-post","status-publish","format-standard","hentry","category-ai","category-llm","category-llms","tag-lora","tag-pdf"],"views":2132,"jetpack_sharing_enabled":true,"jetpack_featured_media_url":"","_links":{"self":[{"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/posts\/4546","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=4546"}],"version-history":[{"count":12,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/posts\/4546\/revisions"}],"predecessor-version":[{"id":4558,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=\/wp\/v2\/posts\/4546\/revisions\/4558"}],"wp:attachment":[{"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=4546"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=4546"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.aqwu.net\/wp\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=4546"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}