# Default chunking parameters: chunk length 1024 with an overlap of 200
# between neighbouring chunks (unit not stated in this file — TODO confirm
# whether these are tokens or characters).
chunk_size, chunk_overlap = 1024, 200
默认:chunk 长度 1024,重叠 200。
| 文档类型 | 推荐 chunk_size | overlap | 额外技巧 |
|---|---|---|---|
| 制度/规章(段落长、条款多) | 1 024–2 048 | 200–400 | 以“## 标题”为分隔符,保证条款完整 |
| 接口文档(URL/错误码表格) | 512–1 024 | 100–200 | 一级标题 + 二级标题拼一起,别把表格拦腰砍断 |
| 会议记录/周报(短片段) | 256–512 | 50–100 | 按“---”或“## 周次”切,避免把多周拼一起 |
| 超长 RFC/设计文档 | 2 048–4 096 | 400 | 先按“##”切,再对子章节二次切,做双层摘要索引 |
关键词
A whoosh+jieba 10万段 chunk 约2000篇
# pip install whoosh jieba llama-index-core llama-index-retrievers-whoosh
from llama_index.core import SimpleDirectoryReader
from llama_index.retrievers.whoosh import WhooshRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
# 1. Load the documents found under the "data" directory.
documents = SimpleDirectoryReader("data").load_data()

# 2. Build (or load) the Whoosh inverted index kept in ./whoosh_idx.
# NOTE(review): the WhooshRetriever arguments are kept exactly as written;
# confirm them against the installed llama_index.retrievers.whoosh package.
whoosh_retriever = WhooshRetriever(
    index_dir="./whoosh_idx",
    docs=documents,
    top_k=5,
)

# 3. Keyword-only query engine on top of the Whoosh retriever.
query_engine = RetrieverQueryEngine(whoosh_retriever)

# 4. Interactive loop: answer questions until the user enters "q".
if __name__ == "__main__":
    print("=== Whoosh 纯关键词查询 ===")
    while True:
        q = input("\n问题 (q退出): ").strip()
        if q.lower() == "q":
            break
        if not q:
            # Nothing to search for; prompt again instead of querying.
            continue
        resp = query_engine.query(q)
        print("答:", resp)
B aiosqlite+fts5 100万段 chunk 约 2万篇(10万篇以内)
# pip install aiosqlite llama-index-core llama-index-retrievers-sqlite-ft
from llama_index.core import SimpleDirectoryReader
from llama_index.retrievers.sqlite_ft import SQLiteFTSRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
# 1. Load the documents found under the "data" directory.
documents = SimpleDirectoryReader("data").load_data()

# 2. Build (or load) the FTS5 inverted index (single file gov_docs.db).
# NOTE(review): the SQLiteFTSRetriever arguments are kept exactly as written;
# confirm them against the installed llama_index.retrievers.sqlite_ft package.
sqlite_retriever = SQLiteFTSRetriever(
    db_path="gov_docs.db",
    # Per the original note: the first run creates the table automatically,
    # later runs reuse it.
    docs=documents,
    top_k=5,
)

# 3. Keyword-only query engine on top of the SQLite FTS retriever.
query_engine = RetrieverQueryEngine(sqlite_retriever)

# 4. Interactive loop: answer questions until the user enters "q".
if __name__ == "__main__":
    print("=== SQLite-FTS5 纯关键词查询 ===")
    while True:
        q = input("\n问题 (q退出): ").strip()
        if q.lower() == "q":
            break
        if not q:
            # Nothing to search for; prompt again instead of querying.
            continue
        resp = query_engine.query(q)
        print("答:", resp)
混合查询 关键词 +向量
pip install llama-index-core llama-index-llms-ollama llama-index-embeddings-ollama
# 关键词方案二选一
pip install llama-index-retrievers-whoosh # 路线① Whoosh
# 或
pip install llama-index-retrievers-sqlite-ft # 路线② SQLite-FTS5
代码
from llama_index.core import VectorStoreIndex
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import QueryFusionRetriever

# Keyword backend paired with the vector index: "whoosh" (plan A) or
# "sqlite" (plan B). Only the chosen backend is imported, so only its
# package has to be installed.
KEYWORD_BACKEND = "whoosh"

# Vector side. `documents` is the list produced earlier by
# SimpleDirectoryReader(...).load_data().
vector_index = VectorStoreIndex.from_documents(documents)

# Keyword side: build exactly one retriever.
if KEYWORD_BACKEND == "whoosh":
    # Plan A
    from llama_index.retrievers.whoosh import WhooshRetriever

    keyword_retriever = WhooshRetriever(
        index_dir="./whoosh_idx",
        docs=documents,
        top_k=5,
    )
elif KEYWORD_BACKEND == "sqlite":
    # Plan B
    from llama_index.retrievers.sqlite_ft import SQLiteFTSRetriever

    keyword_retriever = SQLiteFTSRetriever(
        db_path="gov_docs.db",
        docs=documents,
        top_k=5,
    )
else:
    raise ValueError(f"Unknown KEYWORD_BACKEND: {KEYWORD_BACKEND!r}")

# Hybrid retrieval: fuse the vector hits with the keyword hits.
hybrid_retriever = QueryFusionRetriever(
    retrievers=[
        vector_index.as_retriever(similarity_top_k=5),
        keyword_retriever,
    ],
    # retriever_weights is only used by the score-based fusion modes, so one
    # is selected explicitly; otherwise the weights below have no effect.
    mode="relative_score",
    retriever_weights=[0.7, 0.3],  # favour vector hits; tune per use case
    similarity_top_k=5,  # only the top 5 chunks are handed to the LLM
    num_queries=1,  # do not rewrite the user's question
)

# Query engine driven by the hybrid retriever.
query_engine = RetrieverQueryEngine(hybrid_retriever)

# Interactive loop: answer questions until the user enters "q".
if __name__ == "__main__":
    print("=== 向量 + 关键词 联合召回 ===")
    while True:
        q = input("\n问题 (q退出): ").strip()
        if q.lower() == "q":
            break
        if not q:
            # Nothing to search for; prompt again instead of querying.
            continue
        resp = query_engine.query(q)
        print("答:", resp)
SQLITE 补充 多次document插入
# Collect the documents of every existing file listed in `bk`, then embed
# them all in a single pass.
documents = []
for _, afile, _ in bk:  # only the middle element (the file path) is used
    if not os.path.isfile(afile):
        print(f'跳过不存在文件:{afile}')
        continue
    # `afile` is a single file, so it must go through input_files=; the first
    # positional argument of SimpleDirectoryReader is a directory (input_dir).
    documents.extend(
        SimpleDirectoryReader(input_files=[afile]).load_data()
    )

# Safety net: stop before indexing when nothing was read.
if not documents:
    raise SystemExit('没有有效文档,无法构建索引!')

index = VectorStoreIndex.from_documents(documents)  # vectorize once