1. Simplest usage of VectorStoreIndex: VectorStoreIndex.from_documents(documents) builds the index directly from the loaded Documents.
2. Building from nodes that have already been split (the common case): VectorStoreIndex(nodes).
Querying: query_engine=index.as_query_engine(similarity_top_k=3), then query_engine.query("xxxxxx?").
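A minimal end-to-end sketch of both construction paths (the data directory, chunk sizes, and query string are taken from the demo later in these notes; everything else is only illustrative):

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter

documents = SimpleDirectoryReader("data").load_data()
# Path 1: build straight from Documents (chunking happens internally)
index_from_docs = VectorStoreIndex.from_documents(documents)
# Path 2: split into nodes yourself, then build from the nodes (common case)
nodes = SentenceSplitter(chunk_size=512, chunk_overlap=30).get_nodes_from_documents(documents)
index_from_nodes = VectorStoreIndex(nodes)
query_engine = index_from_nodes.as_query_engine(similarity_top_k=3)
print(query_engine.query("where is Link"))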
Chroma setup
First, the required imports:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import SimpleDirectoryReader,VectorStoreIndex,Settings,StorageContext,load_index_from_storage
Create the Chroma storage directory and the collection
# Chroma initialization: persistent storage directory
chroma_client=chromadb.PersistentClient(path="./storage/chroma")
# Chroma vector collection
chroma_context=chroma_client.get_or_create_collection("My_collection")
Wrap the collection just created in the Chroma DB as a ChromaVectorStore object, then build a StorageContext from it for the steps below.
# use the Chroma collection created above as llama_index's vector store
vector_store=ChromaVectorStore(chroma_collection=chroma_context)
# bundle the vector store into a StorageContext, used by the save step below: index=VectorStoreIndex(new_nodes,storage_context=storage_context)
storage_context=StorageContext.from_defaults(vector_store=vector_store)
Vectorize the nodes against the StorageContext, then save with the persist method (persist_dir also stores the vector-to-node relationship data: Chroma stores the vectors, persist stores the relationships).
index=VectorStoreIndex(new_nodes,storage_context=storage_context)
# this step performs the save; it only takes effect once executed and a directory is given. persist_dir must be used, since it also stores the vector-to-node relationship data
index.storage_context.persist(persist_dir="./storage")
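A quick way to see that split on disk (a sketch only; the exact file names written by persist, such as docstore.json and index_store.json, depend on the LlamaIndex version and are assumptions here):

import os
# ./storage holds the index/docstore mappings written by persist()
print(os.listdir("./storage"))
# ./storage/chroma holds Chroma's own database files with the vectors
print(os.listdir("./storage/chroma"))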
Load the Chroma collection
# reopen the persistent client and fetch the collection created earlier
loaded_client=chromadb.PersistentClient(path='./storage/chroma')
loaded_collection=loaded_client.get_collection("My_collection")
# wrap the loaded collection as a LlamaIndex ChromaVectorStore object
loaded_vector_store=ChromaVectorStore(chroma_collection=loaded_collection)
# rebuild a StorageContext from the persisted directory and the loaded vector store
loaded_storage_context=StorageContext.from_defaults(
persist_dir="./storage",
vector_store=loaded_vector_store
)
# turn the storage context back into a usable index for querying
index=load_index_from_storage(loaded_storage_context)
query_engine=index.as_query_engine(similarity_top_k=3)
ee=query_engine.query("where is Link")
print(ee)
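If only the vectors in Chroma are needed (not the mappings saved under persist_dir), the index can also be rebuilt straight from the vector store; a minimal sketch using the same loaded_vector_store, assuming the embedding model has already been configured on Settings as in the demo below:

index=VectorStoreIndex.from_vector_store(loaded_vector_store)
retriever=index.as_retriever(similarity_top_k=3)
for node in retriever.retrieve("where is Link"):
    print(node)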
Full demo code
from llama_index.core import SimpleDirectoryReader,VectorStoreIndex,Settings,StorageContext,load_index_from_storage
from llama_index.core.node_parser import SentenceSplitter
from llama_index.retrievers import bm25
# equivalently: from llama_index.retrievers.bm25 import BM25Retriever (requires the llama-index-retrievers-bm25 package)
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
import jieba
import time
import joblib
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
OLLAMA_URL="http://127.0.0.1:11434"
EMBED_MODEL="qwen3-embedding:0.6b"
LLM_MODEL="qwen3:0.6b"
Settings.embed_model=OllamaEmbedding(
model_name=EMBED_MODEL,base_url=OLLAMA_URL
)
Settings.llm=Ollama(
model=LLM_MODEL,base_url=OLLAMA_URL
)
# Chroma initialization: persistent storage directory
chroma_client=chromadb.PersistentClient(path="./storage/chroma")
# Chroma vector collection
chroma_context=chroma_client.get_or_create_collection("My_collection")
# use the Chroma collection created above as llama_index's vector store
vector_store=ChromaVectorStore(chroma_collection=chroma_context)
# bundle the vector store into a StorageContext, used by the save step below: index=VectorStoreIndex(new_nodes,storage_context=storage_context)
storage_context=StorageContext.from_defaults(vector_store=vector_store)
documents=SimpleDirectoryReader("data").load_data()
print(documents)
splitter=SentenceSplitter(chunk_size=512,chunk_overlap=30)
new_nodesx=splitter.get_nodes_from_documents(documents)
print(new_nodesx)
documents=SimpleDirectoryReader(input_files=["add.docx",]).load_data()
print(documents)
documents[0].metadata['MAC']='adgfa-192ga'
documents[0].metadata['document_id']=documents[0].id_
print(documents)
new_nodes=splitter.get_nodes_from_documents(documents)
print(new_nodes)
# BM25Retriever subclass intended to tokenize Chinese text with jieba
class ChineseBM25Retriever(bm25.BM25Retriever):
    def _tokenize(self,text):
        return [w for w in jieba.cut(text) if w.strip()]
c=time.time()
retriever=ChineseBM25Retriever(nodes=new_nodes,similarity_top_k=10)
print(time.time()-c,retriever)
# c=time.time()
# retriever=ChineseBM25Retriever(nodes=new_nodesx,similarity_top_k=10)
# print(time.time()-c,retriever)
retriever.persist("./bm25_retriever")
retrieved_nodes = retriever.retrieve(
"What is link?"
)
for node in retrieved_nodes:
    print(node)
del retriever
retriever = bm25.BM25Retriever.from_persist_dir("./bm25_retriever")
print("Reload BM25 from disk")
retrieved_nodes = retriever.retrieve(
"What is link?"
)
for node in retrieved_nodes:
    print(node)
index=VectorStoreIndex(new_nodes,storage_context=storage_context)
# this step performs the save; it only takes effect once executed and the directory is specified with persist_dir
index.storage_context.persist(persist_dir="./storage")
query_engine=index.as_query_engine(similarity_top_k=3)
ee=query_engine.query("where is Link")
print(ee)
print('Load')
# reopen the persistent client and fetch the collection created earlier
loaded_client=chromadb.PersistentClient(path='./storage/chroma')
loaded_collection=loaded_client.get_collection("My_collection")
# wrap the loaded collection as a LlamaIndex ChromaVectorStore object
loaded_vector_store=ChromaVectorStore(chroma_collection=loaded_collection)
# rebuild a StorageContext from the persisted directory and the loaded vector store
loaded_storage_context=StorageContext.from_defaults(
persist_dir="./storage",
vector_store=loaded_vector_store
)
# turn the storage context back into a usable index
index=load_index_from_storage(loaded_storage_context)
query_engine=index.as_query_engine(similarity_top_k=3)
ee=query_engine.query("where is Link")
print(ee)
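The demo attaches a MAC value to the document metadata but never filters on it. As a follow-up sketch (the MetadataFilters/ExactMatchFilter classes come from llama_index.core.vector_stores; the key and value are the ones set in the demo), retrieval can be restricted to nodes carrying that metadata:

from llama_index.core.vector_stores import MetadataFilters, ExactMatchFilter

filters = MetadataFilters(filters=[ExactMatchFilter(key="MAC", value="adgfa-192ga")])
# only nodes whose metadata matches the filter are considered by the retriever
filtered_retriever = index.as_retriever(similarity_top_k=3, filters=filters)
for node in filtered_retriever.retrieve("where is Link"):
    print(node)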