试验一：读取 docx（单个文件、文件目录），并添加 metadata

需要安装 python-docx（注：llama_index 的 DocxReader 读取 .docx 时依赖的是 docx2txt，若提示缺少依赖，请执行 pip install docx2txt）

from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.retrievers import bm25

# Load the contents of the "data" directory into Document objects.
directory_reader = SimpleDirectoryReader(input_dir="data")
documents = directory_reader.load_data()

print(documents)

# Splitter configured with chunk_size=512 and chunk_overlap=30.
splitter = SentenceSplitter(chunk_size=512, chunk_overlap=30)

# Break the loaded documents into nodes and display them.
new_nodes = splitter.get_nodes_from_documents(documents)

print(new_nodes)

# Load one specific file. Note: input_files takes a list of file paths,
# not a directory.
single_file_reader = SimpleDirectoryReader(input_files=["add.docx"])
documents = single_file_reader.load_data()

print(documents)

# Attach custom metadata to the first loaded document.
first_doc = documents[0]
first_doc.metadata["MAC"] = "adgfa-192ga"
# Per the author's note, no document_id is present by default, so the
# document's own id_ is stored under that key explicitly.
first_doc.metadata["document_id"] = first_doc.id_
print(documents)

# Re-split with the splitter created earlier in the script.
new_nodes = splitter.get_nodes_from_documents(documents)
print(new_nodes)

发表回复