安装 CnOCR 及相关依赖
pip install llama-index-readers-file pymupdf docx2txt openpyxl pandas
pip install cnocr[ort-cpu]
pip install watchdog
CnOCR 需要预先下载模型:在一台能联网的机器上创建并执行下面的脚本,自动下载:
# 在一台能上网的机器执行一次即可
from cnocr import CnOcr
_ = CnOcr() # 首次会自动把检测+识别模型
# 下载到 ~/cnocr/2.3 目录
# 把整个 ~/cnocr 文件夹拷到离线机同名路径即可
执行以下主脚本:
# ================= 依赖安装 =================
# pip install llama-index llama-index-llms-ollama llama-index-embeddings-ollama
# pip install cnocr watchdog pymupdf docx2txt pandas openpyxl
# (注意:PyMuPDF 的 pip 包名是 "pymupdf";PyPI 上的 "fitz" 是另一个无关项目)
from pathlib import Path
import time, hashlib
from llama_index.core import Settings, Document, VectorStoreIndex
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from cnocr import CnOcr
import fitz, docx2txt, pandas as pd
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
# ================= Global models =================
Settings.llm = Ollama(model="gemma:4b", request_timeout=600)
Settings.embed_model = OllamaEmbedding(model_name="embeddinggemma")
# Expand "~" explicitly: passing a literal "~/..." relies on the library to
# expand user paths, which not every CnOcr version does — resolve it up front
# so the pre-downloaded weights directory is always found.
ocr = CnOcr(root=str(Path("~/cnocr/2.3").expanduser()))  # pre-downloaded weights
# ================= File reader =================
class LocalFileReader:
    """Load one local file into llama-index Documents.

    PDFs and images are OCR-ed with CnOcr; Word/Excel are read directly;
    txt/md fall back to SimpleDirectoryReader.
    """

    @staticmethod
    def _ocr_to_text(img) -> str:
        """OCR one image and join recognized lines with newlines.

        cnocr >= 2.2 returns a list of dicts with a 'text' key; older
        versions returned per-character lists — handle both shapes.
        """
        out = ocr.ocr(img)
        lines = [item["text"] if isinstance(item, dict) else "".join(item) for item in out]
        return "\n".join(lines)

    def load_data(self, file_path: str):
        p = Path(file_path)
        suffix = p.suffix.lower()

        if suffix == ".pdf":
            import numpy as np  # local import: keeps module-level deps unchanged
            pages = []
            doc = fitz.open(p)
            try:
                for page in doc:
                    pix = page.get_pixmap()
                    # CnOcr expects a path / PIL image / ndarray — NOT raw PNG
                    # bytes — so expose the rendered page as an ndarray view.
                    img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                        pix.height, pix.width, pix.n
                    )
                    pages.append(self._ocr_to_text(img))
            finally:
                doc.close()  # release the file handle even if OCR fails
            # join pages with a newline (previously pages were concatenated
            # with no separator, merging the last/first lines of two pages)
            return [Document(text="\n".join(pages), metadata={"file_name": p.name})]

        if suffix in (".docx", ".doc"):
            txt = docx2txt.process(p)
            return [Document(text=txt, metadata={"file_name": p.name})]

        if suffix in (".xlsx", ".xls"):
            dfs = pd.read_excel(p, sheet_name=None)
            # Build per-sheet text outside the f-string: a backslash ("\t")
            # inside an f-string expression is a SyntaxError before Python 3.12.
            sheet_texts = []
            for sheet_name, df in dfs.items():
                sheet_texts.append(f"【{sheet_name}】\n" + df.to_csv(index=False, sep="\t"))
            return [Document(text="\n".join(sheet_texts), metadata={"file_name": p.name})]

        if suffix in (".png", ".jpg", ".jpeg"):
            txt = self._ocr_to_text(str(p))
            return [Document(text=txt, metadata={"file_name": p.name})]

        # txt / md — delegate to the generic reader
        from llama_index.core import SimpleDirectoryReader
        return SimpleDirectoryReader(input_files=[p]).load_data()
# ================= Helper: file content hash =================
def file_hash(path: Path, chunk: int = 8192) -> str:
    """Return the MD5 hex digest of *path*'s contents.

    Reads in `chunk`-byte pieces so large files never load into memory
    at once.  (The previous version reused `chunk` to hold the bytes
    just read, so the second `f.read(chunk)` received a bytes object and
    raised TypeError on any file larger than one chunk.)
    """
    h = hashlib.md5()  # fast non-crypto fingerprint — fine for change detection
    with open(path, "rb") as f:
        while data := f.read(chunk):
            h.update(data)
    return h.hexdigest()
# ================= Directory scan + index build =================
DATA_DIR = Path("data")
SUPPORTED = (".pdf", ".docx", ".doc", ".xlsx", ".xls", ".png", ".jpg", ".jpeg", ".txt", ".md")

def build_index():
    """Scan DATA_DIR recursively and build a VectorStoreIndex over every supported file."""
    reader = LocalFileReader()  # hoisted: one reader reused for all files
    all_docs = []
    for path in DATA_DIR.rglob("*"):
        if path.is_file() and path.suffix.lower() in SUPPORTED:
            all_docs += reader.load_data(path)
    return VectorStoreIndex.from_documents(all_docs)
# ================= Watcher: rebuild the index on file changes =================
class DataHandler(FileSystemEventHandler):
    """Invokes a rebuild callback whenever a supported file in the watched tree changes."""

    def __init__(self, rebuild_func):
        self.rebuild = rebuild_func

    def on_any_event(self, event):
        # Guard clauses: skip directory events and unsupported file types.
        if event.is_directory:
            return
        if Path(event.src_path).suffix.lower() not in SUPPORTED:
            return
        print(f"[watchdog] {event.event_type} -> {event.src_path}")
        self.rebuild()
def rebuild():
    """Re-index DATA_DIR and atomically swap in a fresh query engine."""
    global query_engine
    print("[rebuild] 正在重新索引...")
    query_engine = build_index().as_query_engine()
    print("[rebuild] 完成!")
# ================= Initial index build =================
query_engine = build_index().as_query_engine()
# ================= Start watching DATA_DIR =================
observer = Observer()
# recursive=True: also pick up changes inside nested sub-directories
observer.schedule(DataHandler(rebuild), str(DATA_DIR), recursive=True)
observer.start()
# ================= Interactive REPL / manual test =================
if __name__ == "__main__":
    try:
        while True:
            question = input("\n请输入问题(q 退出):").strip()
            if question.lower() == "q":
                break
            print(query_engine.query(question))
    except KeyboardInterrupt:
        pass
    finally:
        # Always shut the watcher thread down cleanly, however the loop ended.
        observer.stop()
        observer.join()
FastAPI 开放 API 服务(独立脚本):
from pathlib import Path
from fastapi import FastAPI
from pydantic import BaseModel
from llama_index.core import Settings, Document, VectorStoreIndex
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from cnocr import CnOcr
import fitz, docx2txt, pandas as pd
# ---------------- Models ----------------
Settings.llm = Ollama(model="gemma:4b", request_timeout=600)
Settings.embed_model = OllamaEmbedding(model_name="embeddinggemma")
# Expand "~" explicitly so the pre-downloaded CnOcr weights are found
# regardless of whether the library expands user paths itself.
ocr = CnOcr(root=str(Path("~/cnocr/2.3").expanduser()))
# ---------------- File reader ----------------
class LocalFileReader:
    """Load one local file into llama-index Documents.

    PDFs and images are OCR-ed with CnOcr; Word/Excel are read directly;
    txt/md fall back to SimpleDirectoryReader.
    """

    @staticmethod
    def _ocr_to_text(img) -> str:
        """OCR one image and join recognized lines with newlines.

        cnocr >= 2.2 returns a list of dicts with a 'text' key; older
        versions returned per-character lists — handle both shapes.
        """
        out = ocr.ocr(img)
        lines = [item["text"] if isinstance(item, dict) else "".join(item) for item in out]
        return "\n".join(lines)

    def load_data(self, file_path: str):
        p = Path(file_path)
        suffix = p.suffix.lower()

        if suffix == ".pdf":
            import numpy as np  # local import: keeps module-level deps unchanged
            pages = []
            doc = fitz.open(p)
            try:
                for page in doc:
                    pix = page.get_pixmap()
                    # CnOcr expects a path / PIL image / ndarray — NOT raw PNG
                    # bytes — so expose the rendered page as an ndarray view.
                    img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                        pix.height, pix.width, pix.n
                    )
                    pages.append(self._ocr_to_text(img))
            finally:
                doc.close()  # release the file handle even if OCR fails
            # join pages with a newline (previously pages were concatenated
            # with no separator, merging the last/first lines of two pages)
            return [Document(text="\n".join(pages), metadata={"file_name": p.name})]

        if suffix in (".docx", ".doc"):
            txt = docx2txt.process(p)
            return [Document(text=txt, metadata={"file_name": p.name})]

        if suffix in (".xlsx", ".xls"):
            dfs = pd.read_excel(p, sheet_name=None)
            # Build per-sheet text outside the f-string: a backslash ("\t")
            # inside an f-string expression is a SyntaxError before Python 3.12.
            sheet_texts = []
            for sheet_name, df in dfs.items():
                sheet_texts.append(f"【{sheet_name}】\n" + df.to_csv(index=False, sep="\t"))
            return [Document(text="\n".join(sheet_texts), metadata={"file_name": p.name})]

        if suffix in (".png", ".jpg", ".jpeg"):
            txt = self._ocr_to_text(str(p))
            return [Document(text=txt, metadata={"file_name": p.name})]

        # txt / md — delegate to the generic reader
        from llama_index.core import SimpleDirectoryReader
        return SimpleDirectoryReader(input_files=[p]).load_data()
# ---------------- Build the index (once, at startup) ----------------
DATA_DIR = Path("data")
SUPPORTED = (".pdf", ".docx", ".doc", ".xlsx", ".xls", ".png", ".jpg", ".jpeg", ".txt", ".md")
_reader = LocalFileReader()  # hoisted: one reader reused for every file
all_docs = []
for p in DATA_DIR.rglob("*"):
    if p.is_file() and p.suffix.lower() in SUPPORTED:
        all_docs += _reader.load_data(p)
index = VectorStoreIndex.from_documents(all_docs)
query_engine = index.as_query_engine()
# ---------------- FastAPI ----------------
app = FastAPI(title="LocalRAG")

class Q(BaseModel):
    """Request body for POST /ask."""
    # The user's natural-language question.
    question: str

@app.post("/ask")
def ask(q: Q):
    """Answer a question against the startup-built index (synchronous, blocking call)."""
    return {"answer": str(query_engine.query(q.question))}
# ---------------- 启动命令 ----------------
# uvicorn api:app --host 0.0.0.0 --port 8000