llamaindex 向量化VectorStoreIndex

1. 简单用法：使用 VectorStoreIndex.from_documents(documents) 直接读取文档（Documents）并建立索引。

2. 常用用法：使用 VectorStoreIndex(nodes) 直接传入已经切分好的节点（Nodes）建立索引。

不常用：查询 query_engine=index.as_query_engine(similarity_top_k=3)，然后调用 query_engine.query("xxxxxx?") 即可

chroma设置

首先需要导入

import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore


from llama_index.core import SimpleDirectoryReader,VectorStoreIndex,Settings,StorageContext,load_index_from_storage

创建chroma存储目录以及集合名称

# Initialise Chroma with an on-disk storage directory.
chroma_client = chromadb.PersistentClient(path="./storage/chroma")
# Fetch the Chroma vector collection, creating it if it does not exist yet.
chroma_context = chroma_client.get_or_create_collection(name="My_collection")

使用 llama_index 将刚刚创建的 Chroma 集合包装为 ChromaVectorStore 对象，并最终转为 StorageContext 对象用于后续处理

# Wrap the Chroma collection so LlamaIndex can use it as its vector store.
vector_store = ChromaVectorStore(chroma_collection=chroma_context)
# Bundle the vector store into a StorageContext; it is passed to the index
# later on: index = VectorStoreIndex(new_nodes, storage_context=storage_context)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

向量化并关联 StorageContext 对象，最后调用 persist 方法落盘（必须指定 persist_dir；该目录中还会存储向量与节点的关系数据【Chroma 存向量，persist 存关联】）

# Build the vector index from the nodes, backed by the storage context.
index = VectorStoreIndex(nodes=new_nodes, storage_context=storage_context)
# Nothing is saved until persist() runs, and persist_dir must be given.
# Per the author's note, this directory also stores the vector-to-node
# relationship data.
index.storage_context.persist(
    persist_dir="./storage",
)

载入chroma的集合

# Reopen the persisted Chroma database and fetch the existing collection.
loaded_client = chromadb.PersistentClient(path='./storage/chroma')
loaded_collection = loaded_client.get_collection(name="My_collection")

# Convert the collection into a LlamaIndex ChromaVectorStore object.
loaded_vector_store = ChromaVectorStore(chroma_collection=loaded_collection)

# Rebuild a StorageContext from ./storage, backed by the Chroma vector store.
loaded_storage_context = StorageContext.from_defaults(
    persist_dir="./storage",
    vector_store=loaded_vector_store,
)

# Turn the storage context back into a usable index for querying.
index = load_index_from_storage(storage_context=loaded_storage_context)

query_engine = index.as_query_engine(similarity_top_k=3)
ee = query_engine.query("where is Link")
print(ee)

整体DEMO代码

from llama_index.core import SimpleDirectoryReader,VectorStoreIndex,Settings,StorageContext,load_index_from_storage
from llama_index.core.node_parser import SentenceSplitter
from llama_index.retrievers import bm25
# from llama_index.core.retrievers import BM25Retriever
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
import jieba
import time
import joblib
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore


# Local Ollama endpoint and the names of the embedding / chat models.
OLLAMA_URL = "http://127.0.0.1:11434"
EMBED_MODEL = "qwen3-embedding:0.6b"
LLM_MODEL = "qwen3:0.6b"

# Register both models on the global LlamaIndex Settings object.
Settings.embed_model = OllamaEmbedding(model_name=EMBED_MODEL, base_url=OLLAMA_URL)
Settings.llm = Ollama(model=LLM_MODEL, base_url=OLLAMA_URL)

# Initialise Chroma with an on-disk storage directory.
chroma_client = chromadb.PersistentClient(path="./storage/chroma")
# Fetch the Chroma vector collection, creating it if it does not exist yet.
chroma_context = chroma_client.get_or_create_collection(name="My_collection")

# Wrap the Chroma collection so LlamaIndex can use it as its vector store.
vector_store = ChromaVectorStore(chroma_collection=chroma_context)
# Bundle the vector store into a StorageContext; it is passed to the index
# further down: index = VectorStoreIndex(new_nodes, storage_context=storage_context)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

# Load everything under the "data" directory and split it into nodes
# (chunk_size=512, chunk_overlap=30).
documents = SimpleDirectoryReader(input_dir="data").load_data()
print(documents)

splitter = SentenceSplitter(chunk_size=512, chunk_overlap=30)
new_nodesx = splitter.get_nodes_from_documents(documents)
print(new_nodesx)

# Load one extra file. `documents` is rebound here, so from this point on it
# only holds what was read from add.docx.
documents = SimpleDirectoryReader(input_files=["add.docx"]).load_data()
print(documents)

# Attach custom metadata to the first loaded document only.
first_document = documents[0]
first_document.metadata['MAC'] = 'adgfa-192ga'
first_document.metadata['document_id'] = first_document.id_
print(documents)

# Split the tagged document(s) with the same splitter.
new_nodes = splitter.get_nodes_from_documents(documents)
print(new_nodes)

class ChineseBM25Retriever(bm25.BM25Retriever):
    """BM25 retriever whose ``_tokenize`` hook segments text with jieba.

    NOTE(review): whether the installed BM25Retriever actually calls
    ``_tokenize`` depends on the library version -- confirm that this
    override takes effect.
    """

    def _tokenize(self, text):
        """Return the jieba segments of *text*, skipping blank ones."""
        segments = jieba.cut(text)
        # strip() is falsy for empty or whitespace-only segments.
        return [segment for segment in segments if segment.strip()]

# Build the BM25 retriever over new_nodes and time the construction.
# (Pass new_nodesx instead to index the nodes split from the "data" directory.)
# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by wall-clock adjustments the way time.time() can.
c = time.perf_counter()
retriever = ChineseBM25Retriever(nodes=new_nodes, similarity_top_k=10)
print(time.perf_counter() - c, retriever)

# Save the BM25 retriever to disk so it can be reloaded later.
retriever.persist("./bm25_retriever")

retrieved_nodes = retriever.retrieve("What is link?")
for node in retrieved_nodes:
    print(node)

# Drop the in-memory retriever and reload it from disk.
# Reload through ChineseBM25Retriever rather than the base BM25Retriever so
# the reloaded object keeps the jieba-based _tokenize override.
# NOTE(review): this relies on from_persist_dir being a classmethod that
# instantiates `cls` -- confirm against the installed library version.
del retriever
retriever = ChineseBM25Retriever.from_persist_dir("./bm25_retriever")

print("Reload BM25 from disk")
retrieved_nodes = retriever.retrieve("What is link?")
for node in retrieved_nodes:
    print(node)


# Build the vector index from the nodes, backed by the storage context.
index = VectorStoreIndex(nodes=new_nodes, storage_context=storage_context)
# Nothing is saved until persist() runs, and the target directory must be given.
index.storage_context.persist(
    persist_dir="./storage",
)

# Query the freshly built index.
query_engine = index.as_query_engine(similarity_top_k=3)
ee = query_engine.query("where is Link")
print(ee)

print('Load')

# Reopen the persisted Chroma database and fetch the existing collection.
loaded_client = chromadb.PersistentClient(path='./storage/chroma')
loaded_collection = loaded_client.get_collection(name="My_collection")
# Convert the collection into a LlamaIndex ChromaVectorStore object.
loaded_vector_store = ChromaVectorStore(chroma_collection=loaded_collection)

# Rebuild a StorageContext from ./storage, backed by the Chroma vector store.
loaded_storage_context = StorageContext.from_defaults(
    persist_dir="./storage",
    vector_store=loaded_vector_store,
)

# Turn the storage context back into a usable index and run the same query.
index = load_index_from_storage(storage_context=loaded_storage_context)
query_engine = index.as_query_engine(similarity_top_k=3)
ee = query_engine.query("where is Link")
print(ee)

llamaindex 向量化VectorStoreIndex

chroma设置

创建chroma存储目录以及集合名词

载入chroma的合集

整体DEMO代码

发表回复取消回复

近期文章

近期评论

归档

分类

友情链接

其他操作

llamaindex 向量化VectorStoreIndex

chroma设置

创建chroma存储目录以及集合名词

载入chroma的合集

整体DEMO代码

发表回复 取消回复

近期文章

近期评论

归档

分类

友情链接

其他操作

发表回复取消回复