# Requires the python-docx package to be installed (pip install python-docx).
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.retrievers import bm25
# Step 1: load every file the reader finds in the "data" directory and show
# the resulting document objects.
documents = SimpleDirectoryReader(input_dir="data").load_data()
print(documents)

# Step 2: split the loaded documents into nodes using a sentence splitter
# configured with chunk_size=512 and chunk_overlap=30.
splitter = SentenceSplitter(chunk_size=512, chunk_overlap=30)
new_nodes = splitter.get_nodes_from_documents(documents)
print(new_nodes)

# Step 3: load a single file instead of a directory.
# input_files is given an explicit list of file paths (not a directory).
documents = SimpleDirectoryReader(input_files=["add.docx"]).load_data()
print(documents)

# Step 4: attach custom metadata to the first loaded document.
# "document_id" is added by hand from the document's own id_; per the
# original author's note it is not present in the metadata by default.
documents[0].metadata.update(
    {
        "MAC": "adgfa-192ga",
        "document_id": documents[0].id_,
    }
)
print(documents)

# Step 5: split again so the nodes are built from the document that now
# carries the extra metadata, and show them.
new_nodes = splitter.get_nodes_from_documents(documents)
print(new_nodes)