【AI-Snova】数据提取(Data Extraction)--Markdown extraction

写在前面

  • examples of text extraction from MD files with different packages

    使用不同的包从 MD 文件中提取文本的示例

Table of contents

依赖引入

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import os
import sys

# Resolve the kit directory (parent of the working directory) and the
# repository root (parent of the kit directory).
current_dir = os.getcwd()
kit_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
repo_dir = os.path.abspath(os.path.join(kit_dir, os.pardir))

# Append both directories to the module search path, kit first.
sys.path.extend((kit_dir, repo_dir))

import glob
import pandas as pd
from dotenv import load_dotenv
from langchain.text_splitter import MarkdownHeaderTextSplitter
from tqdm.autonotebook import trange

加载Markdown文件

1
2
3
# Collect the Markdown files that sit directly inside the kit directory.
folder_loc = kit_dir
# glob.glob returns matches in arbitrary, filesystem-dependent order; sort
# them so the file selected below is the same on every run.
md_files = sorted(glob.glob(f'{folder_loc}/*.md'))
if not md_files:
    # Fail with an explicit message rather than a bare IndexError below.
    raise FileNotFoundError(f'No Markdown files found in {folder_loc}')
# The first Markdown file is the one loaded by the sections that follow.
file_path = md_files[0]

使用 Unstructured 本地 Markdown 加载器加载

1
2
3
4
5
6
from langchain.document_loaders import UnstructuredMarkdownLoader

# Parse the selected Markdown file locally with the Unstructured loader.
# NOTE(review): per the LangChain docs, mode="elements" yields one document
# per parsed element rather than one for the whole file; confirm against the
# installed version.
loader = UnstructuredMarkdownLoader(file_path, mode="elements")
docs_unstructured_local = loader.load()

# Preview the first ten loaded documents, each followed by a '---' rule.
for doc in docs_unstructured_local[:10]:
    print(f'{doc.page_content}\n---')

通过 Unstructured.io API 加载

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
from langchain.document_loaders import UnstructuredAPIFileLoader

# A free API key must be obtained by registering at Unstructured.io.
# Environment variables are loaded from the .env file in the repository root.
load_dotenv(os.path.join(repo_dir, '.env'))

# Parse the same Markdown file through the hosted Unstructured API; the key
# and endpoint URL are read from the environment.
loader = UnstructuredAPIFileLoader(
    file_path,
    mode="elements",
    api_key=os.environ.get('UNSTRUCTURED_API_KEY'),
    url=os.environ.get("UNSTRUCTURED_URL"),
)

docs_unstructured_api = loader.load()

# Print every document returned by the API, each followed by a '---' rule.
for doc in docs_unstructured_api:
    print(f'{doc.page_content}\n---')

通过嵌入相似度来评估已加载的文档（loaded docs）

嵌入和存储

1
2
3
4
5
6
7
8
9
10
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS

# Options passed to the embedding model through its encode_kwargs argument.
encode_kwargs = dict(normalize_embeddings=True)

# A single embedding model is shared by both vector stores below.
embd_model = HuggingFaceInstructEmbeddings(
    model_name='intfloat/e5-large-v2',
    embed_instruction="",  # no instructions needed for candidate passages
    query_instruction="Represent this sentence for searching relevant passages: ",
    encode_kwargs=encode_kwargs,
)

# Build one FAISS store per loader so their search results can be compared.
vectorstore_unstructured_local = FAISS.from_documents(
    docs_unstructured_local, embd_model
)
vectorstore_unstructured_api = FAISS.from_documents(
    docs_unstructured_api, embd_model
)

相似度搜索

1
2
3
4
5
6
7
8
9
10
# The same question is run against both vector stores.
query = "how I clone the repo?"

# Store built from the local Unstructured loader: show its first result.
ans = vectorstore_unstructured_local.similarity_search(query)
print("-------Unstructured local Loader----------\n")
top_local = ans[0]
print(top_local.page_content)

# Store built from the Unstructured API loader: show its first result.
ans_2 = vectorstore_unstructured_api.similarity_search(query)
print("--------Unstructured api loader------------\n")
top_api = ans_2[0]
print(top_api.page_content)
  • Copyrights © 2024-2025 brocademaple
  • 访问人数: | 浏览次数:

      请我喝杯咖啡吧~

      支付宝
      微信