
CnOCR's default recognition model

Unzip the model file, create a .cnocr directory under your home directory, and move the extracted 2.3 directory into it.
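A sketch of that setup, assuming the downloaded archive is named cnocr-models.zip (hypothetical name; substitute the real file):

# Sketch: unpack pre-downloaded CnOCR weights into the default ~/.cnocr directory.
import zipfile
from pathlib import Path

target = Path.home() / ".cnocr"          # CnOCR's default model root
target.mkdir(exist_ok=True)
with zipfile.ZipFile("cnocr-models.zip") as zf:
    zf.extractall(target)                # should leave a ~/.cnocr/2.3 directory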

Install CnOCR

pip install "cnocr[ort-cpu]" -i https://pypi.tuna.tsinghua.edu.cn/simple

from cnocr import CnOcr

img_fp = 'page_10.png'
#ocr = CnOcr()  # all parameters left at their defaults
ocr = CnOcr(rec_model_name='doc-densenet_lite_136-gru')
out = ocr.ocr(img_fp)

print(out)
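In cnocr 2.x, out is a list of per-line dicts with 'text', 'score', and 'position' keys; a small sketch for collapsing the result into plain text:

# Sketch: flatten the CnOcr result, one recognized line per row.
plain = "\n".join(line["text"] for line in out)
print(plain)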

Python virtual environments

python -m venv ai
# create a virtual environment; "ai" is the folder created under the current directory

source ai/bin/activate
# activate the virtual environment in the ai folder (run this after creating it)

deactivate
# leave the current virtual environment

python -m pip install -U pip
# update pip
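To confirm the environment is actually active, a quick check (inside a venv, sys.prefix differs from sys.base_prefix):

# Sketch: verify the interpreter is running inside a virtual environment.
import sys
print("in venv:", sys.prefix != sys.base_prefix)
print("prefix:", sys.prefix)   # points at the ai/ directory when active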

OCR: MinerU 2.5

# Option 1: install with uv
python -m pip install -U pip uv -i https://mirrors.aliyun.com/pypi/simple
uv pip install -U "mineru[core]" -i https://mirrors.aliyun.com/pypi/simple

# Option 2: plain pip
pip install -U "mineru[core]" -i https://mirrors.aliyun.com/pypi/simple

# Download the model weights (China mirror)
# First point MinerU at the modelscope source; three ways, one per shell:

export MINERU_MODEL_SOURCE=modelscope
# Linux

$Env:MINERU_MODEL_SOURCE = "modelscope"
# Windows PowerShell

set MINERU_MODEL_SOURCE=modelscope
# Windows CMD

mineru-models-download        # first run auto-generates the ~/mineru.json config
# it will ask you to pick a download source: choose modelscope
# and a download type: choose all


# Check the installed version
mineru --version
# e.g. mineru 0.9.2

# Run a single-threaded CPU test on one PDF
mineru -p sample.pdf -o out_dir

# out_dir will contain a sample.md file that can be handed to an LLM
# any text model can consume it; qwen3 works too (see the sketch below)
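A minimal sketch of that hand-off, assuming an Ollama server on localhost:11434 with a qwen3 model already pulled (adjust the tag to whatever you run):

# Sketch: feed MinerU's Markdown output to a local Ollama model.
import json, urllib.request

md = open("out_dir/sample.md", encoding="utf-8").read()
req = urllib.request.Request(
    "http://localhost:11434/api/generate",
    data=json.dumps({
        "model": "qwen3",                  # assumed tag; use your local model
        "prompt": "Summarize this document:\n" + md,
        "stream": False,
    }).encode(),
    headers={"Content-Type": "application/json"},
)
print(json.loads(urllib.request.urlopen(req).read())["response"])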

Saya version 0.0.2

MCP server conversation: a chat model served through Ollama decides whether the memory interface needs to be called, and then hands off to:

Service 1: the memory interface, built on the llamaindex memory module.

The MCP handles the conversation; everything said is saved as text memory, then automatically committed to llamaindex's data directory for re-vectorization.

LLM settings: the relevant information is configured through a prompt inside the MCP.


LlamaIndex has no single knob called a "forgetting parameter". Instead it decomposes human-style forgetting into four mechanisms: time decay, importance weighting, capacity-based eviction, and manual deletion; combined, these can approximate the Ebbinghaus forgetting curve.
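A toy sketch of how those mechanisms combine (all names here are hypothetical, not LlamaIndex APIs): score each memory with an exponential time decay weighted by importance, then evict whatever falls outside the capacity:

# Sketch: time decay + importance weighting + capacity eviction.
import math, time

def retention(age_seconds: float, half_life: float = 86400.0) -> float:
    """Ebbinghaus-style decay: retention halves every `half_life` seconds."""
    return math.exp(-math.log(2) * age_seconds / half_life)

def evict(memories, capacity=100):
    """Keep the top-`capacity` memories by decayed, importance-weighted score."""
    now = time.time()
    scored = sorted(
        memories,
        key=lambda m: m["importance"] * retention(now - m["created_at"]),
        reverse=True,
    )
    return scored[:capacity]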

LlamaIndex alone can cover both conversation memory and file memory.

Feeding PDF / docx / xlsx into llamaindex

Install the dependencies (including CnOCR)

pip install llama-index-readers-file pymupdf docx2txt openpyxl pandas
pip install "cnocr[ort-cpu]"
pip install watchdog

CnOCR model download: first create a script to fetch them automatically:

# Run this once on a machine with internet access
from cnocr import CnOcr
_ = CnOcr()                     # first run auto-downloads the detection +
                                # recognition models into ~/.cnocr/2.3
# then copy the whole ~/.cnocr folder to the same path on the offline machine
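On the offline machine the copy step can be scripted as well; a sketch assuming the weights arrived as a folder named cnocr_weights (hypothetical name):

# Sketch: place pre-downloaded CnOCR weights into the default location.
import shutil
from pathlib import Path

src = Path("cnocr_weights")              # wherever the copied folder landed
dst = Path.home() / ".cnocr"
shutil.copytree(src, dst, dirs_exist_ok=True)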

Run the full script:

# =================  Dependencies  =================
# pip install llama-index llama-index-llms-ollama llama-index-embeddings-ollama
# pip install cnocr watchdog pymupdf pillow docx2txt pandas openpyxl
# (the PyPI package is "pymupdf", imported as fitz; "fitz" on PyPI is unrelated)

from pathlib import Path
import hashlib, io

from llama_index.core import Settings, Document, VectorStoreIndex
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from cnocr import CnOcr
from PIL import Image
import fitz                                # PyMuPDF
import docx2txt
import pandas as pd
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

# ================= Global models =================
# NOTE: Ollama has no "gemma:4b" tag; "gemma3:4b" is assumed here.
Settings.llm = Ollama(model="gemma3:4b", request_timeout=600)
Settings.embed_model = OllamaEmbedding(model_name="embeddinggemma")

ocr = CnOcr()   # uses the weights pre-downloaded to the default ~/.cnocr directory

# ================= 文件读取器 =================
class LocalFileReader:
    def load_data(self, file_path: str):
        p = Path(file_path)
        if p.suffix.lower() == ".pdf":
            doc = fitz.open(p)
            text = ""
            for page in doc:
                out = ocr.ocr(page.get_pixmap().tobytes("png"))
                text += "\n".join(["".join(line) for line in out])
            return [Document(text=text, metadata={"file_name": p.name})]

        if p.suffix.lower() in [".docx", ".doc"]:
            txt = docx2txt.process(p)
            return [Document(text=txt, metadata={"file_name": p.name})]

        if p.suffix.lower() in [".xlsx", ".xls"]:
            dfs = pd.read_excel(p, sheet_name=None)
            txt = "\n".join(f"【{s}】\n{d.to_csv(index=False, sep='\t')}" for s, d in dfs.items())
            return [Document(text=txt, metadata={"file_name": p.name})]

        if p.suffix.lower() in [".png", ".jpg", ".jpeg"]:
            out = ocr.ocr(str(p))
            txt = "\n".join(["".join(line) for line in out])
            return [Document(text=txt, metadata={"file_name": p.name})]

        # txt / md
        from llama_index.core import SimpleDirectoryReader
        return SimpleDirectoryReader(input_files=[p]).load_data()

# ================= Utility: file hashing =================
def file_hash(path: Path, chunk=8192):
    h = hashlib.md5()
    with open(path, "rb") as f:
        while data := f.read(chunk):   # don't reuse `chunk`: f.read() needs an int
            h.update(data)
    return h.hexdigest()

# ================= Directory scan + index build =================
DATA_DIR = Path("data")
SUPPORTED = (".pdf", ".docx", ".doc", ".xlsx", ".xls", ".png", ".jpg", ".jpeg", ".txt", ".md")

def build_index():
    all_docs = []
    for p in DATA_DIR.rglob("*"):
        if p.is_file() and p.suffix.lower() in SUPPORTED:
            all_docs += LocalFileReader().load_data(p)
    return VectorStoreIndex.from_documents(all_docs)

# ================= Watcher: rebuild on file changes =================
class DataHandler(FileSystemEventHandler):
    def __init__(self, rebuild_func):
        self.rebuild = rebuild_func

    def on_any_event(self, event):
        if event.is_directory:
            return
        if Path(event.src_path).suffix.lower() in SUPPORTED:
            print(f"[watchdog] {event.event_type} -> {event.src_path}")
            self.rebuild()

def rebuild():
    global query_engine
    print("[rebuild] re-indexing...")
    new_index = build_index()
    query_engine = new_index.as_query_engine()
    print("[rebuild] done!")

# ================= Initial index build =================
query_engine = build_index().as_query_engine()

# ================= Start watching =================
observer = Observer()
observer.schedule(DataHandler(rebuild), str(DATA_DIR), recursive=True)
observer.start()

# ================= Interactive test loop =================
if __name__ == "__main__":
    try:
        while True:
            q = input("\nEnter a question (q to quit): ").strip()
            if q.lower() == "q":
                break
            resp = query_engine.query(q)
            print(resp)
    except KeyboardInterrupt:
        pass
    finally:
        observer.stop()
        observer.join()
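Note that file_hash above is defined but never called. A minimal sketch of how it could gate rebuilds so that watchdog events for unchanged files are ignored (_hash_cache and the two helpers are hypothetical additions, not part of the script above):

# Sketch: only rebuild when some file's content hash actually changed.
_hash_cache = {}

def dirty_files():
    changed = []
    for p in DATA_DIR.rglob("*"):
        if p.is_file() and p.suffix.lower() in SUPPORTED:
            h = file_hash(p)
            if _hash_cache.get(p) != h:   # new file or contents changed
                _hash_cache[p] = h
                changed.append(p)
    return changed

def rebuild_if_dirty():
    if dirty_files():
        rebuild()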

Exposing the API with FastAPI

from pathlib import Path
import io

from fastapi import FastAPI
from pydantic import BaseModel

from llama_index.core import Settings, Document, VectorStoreIndex
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from cnocr import CnOcr
from PIL import Image
import fitz                                # PyMuPDF
import docx2txt
import pandas as pd

# ---------------- Models ----------------
# NOTE: Ollama has no "gemma:4b" tag; "gemma3:4b" is assumed here.
Settings.llm      = Ollama(model="gemma3:4b", request_timeout=600)
Settings.embed_model = OllamaEmbedding(model_name="embeddinggemma")
ocr = CnOcr()   # uses the weights pre-downloaded to the default ~/.cnocr directory

# ---------------- File reader ----------------
class LocalFileReader:
    def load_data(self, file_path: str):
        p = Path(file_path)
        if p.suffix.lower() == ".pdf":
            doc = fitz.open(p)
            text = ""
            for page in doc:
                out = ocr.ocr(page.get_pixmap().tobytes("png"))
                text += "\n".join(["".join(line) for line in out])
            return [Document(text=text, metadata={"file_name": p.name})]

        if p.suffix.lower() in [".docx", ".doc"]:
            txt = docx2txt.process(p)
            return [Document(text=txt, metadata={"file_name": p.name})]

        if p.suffix.lower() in [".xlsx", ".xls"]:
            dfs = pd.read_excel(p, sheet_name=None)
            txt = "\n".join(f"【{s}】\n{d.to_csv(index=False, sep='\t')}" for s, d in dfs.items())
            return [Document(text=txt, metadata={"file_name": p.name})]

        if p.suffix.lower() in [".png", ".jpg", ".jpeg"]:
            out = ocr.ocr(str(p))
            txt = "\n".join(["".join(line) for line in out])
            return [Document(text=txt, metadata={"file_name": p.name})]

        from llama_index.core import SimpleDirectoryReader
        return SimpleDirectoryReader(input_files=[p]).load_data()

# ---------------- Build the index (once at startup) ----------------
DATA_DIR = Path("data")
SUPPORTED = (".pdf", ".docx", ".doc", ".xlsx", ".xls", ".png", ".jpg", ".jpeg", ".txt", ".md")

all_docs = []
for p in DATA_DIR.rglob("*"):
    if p.is_file() and p.suffix.lower() in SUPPORTED:
        all_docs += LocalFileReader().load_data(p)

index = VectorStoreIndex.from_documents(all_docs)
query_engine = index.as_query_engine()

# ---------------- FastAPI ----------------
app = FastAPI(title="LocalRAG")

class Q(BaseModel):
    question: str

@app.post("/ask")
def ask(q: Q):
    return {"answer": str(query_engine.query(q.question))}

# ---------------- Launch command ----------------
# uvicorn api:app --host 0.0.0.0 --port 8000
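Once the server is running, a quick client-side check (assumes the uvicorn command above, so the API listens on localhost:8000):

# Sketch: POST a question to the /ask endpoint and print the answer.
import json, urllib.request

req = urllib.request.Request(
    "http://localhost:8000/ask",
    data=json.dumps({"question": "What is in the data folder?"}).encode(),
    headers={"Content-Type": "application/json"},
)
print(json.loads(urllib.request.urlopen(req).read())["answer"])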