feat(knowledge): add Docling document parsing microservice
Add IBM Docling as a Python FastAPI microservice for high-quality document parsing with table structure recognition (TableFormer ~94% accuracy) and OCR support, replacing pdf-parse/mammoth as the primary text extractor. Architecture: - New docling-service (Python FastAPI, port 3007) in Docker network - knowledge-service calls docling-service via HTTP POST multipart/form-data - Graceful fallback: if Docling fails, falls back to pdf-parse/mammoth - Text/Markdown files skip Docling (no benefit for plain text) Changes: - New: packages/services/docling-service/ (main.py, Dockerfile, requirements.txt) - docker-compose.yml: add docling-service, wire DOCLING_SERVICE_URL to knowledge-service, add missing FILE_SERVICE_URL to conversation-service - text-extraction.service.ts: inject ConfigService, add extractViaDocling() with automatic fallback to legacy extractors - .env.example: add FILE_SERVICE_PORT/URL and DOCLING_SERVICE_PORT/URL Inter-service communication map: conversation-service → file-service (FILE_SERVICE_URL, attachments) conversation-service → knowledge-service (KNOWLEDGE_SERVICE_URL, RAG) knowledge-service → docling-service (DOCLING_SERVICE_URL, document parsing) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
470ec9a64e
commit
57d21526a5
|
|
@ -119,6 +119,8 @@ PAYMENT_SERVICE_PORT=3002
|
|||
KNOWLEDGE_SERVICE_PORT=3003
|
||||
CONVERSATION_SERVICE_PORT=3004
|
||||
EVOLUTION_SERVICE_PORT=3005
|
||||
FILE_SERVICE_PORT=3006
|
||||
DOCLING_SERVICE_PORT=3007
|
||||
|
||||
# ===========================================
|
||||
# 服务间通信 URL
|
||||
|
|
@ -128,6 +130,10 @@ PAYMENT_SERVICE_URL=http://localhost:3002
|
|||
KNOWLEDGE_SERVICE_URL=http://localhost:3003
|
||||
CONVERSATION_SERVICE_URL=http://localhost:3004
|
||||
EVOLUTION_SERVICE_URL=http://localhost:3005
|
||||
# 文件服务(conversation-service 下载附件用)
|
||||
FILE_SERVICE_URL=http://localhost:3006
|
||||
# Docling 文档解析服务(PDF/DOCX 表格结构识别 + OCR)
|
||||
DOCLING_SERVICE_URL=http://localhost:3007
|
||||
|
||||
# ===========================================
|
||||
# Kong API Gateway
|
||||
|
|
|
|||
|
|
@ -250,6 +250,23 @@ services:
|
|||
networks:
|
||||
- iconsulting-network
|
||||
|
||||
docling-service:
|
||||
build:
|
||||
context: ./packages/services/docling-service
|
||||
dockerfile: Dockerfile
|
||||
container_name: iconsulting-docling
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "3007:3007"
|
||||
healthcheck:
|
||||
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:3007/health')"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 120s
|
||||
networks:
|
||||
- iconsulting-network
|
||||
|
||||
knowledge-service:
|
||||
build:
|
||||
context: .
|
||||
|
|
@ -263,6 +280,8 @@ services:
|
|||
condition: service_healthy
|
||||
neo4j:
|
||||
condition: service_healthy
|
||||
docling-service:
|
||||
condition: service_healthy
|
||||
environment:
|
||||
NODE_ENV: production
|
||||
PORT: 3003
|
||||
|
|
@ -278,6 +297,7 @@ services:
|
|||
NEO4J_PASSWORD: ${NEO4J_PASSWORD:-neo4j123}
|
||||
OPENAI_API_KEY: ${OPENAI_API_KEY}
|
||||
OPENAI_BASE_URL: ${OPENAI_BASE_URL:-}
|
||||
DOCLING_SERVICE_URL: http://docling-service:3007
|
||||
ports:
|
||||
- "3003:3003"
|
||||
healthcheck:
|
||||
|
|
@ -315,6 +335,7 @@ services:
|
|||
ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY}
|
||||
ANTHROPIC_BASE_URL: ${ANTHROPIC_BASE_URL:-https://api.anthropic.com}
|
||||
KNOWLEDGE_SERVICE_URL: http://knowledge-service:3003
|
||||
FILE_SERVICE_URL: http://file-service:3006
|
||||
CORS_ORIGINS: https://iconsulting.szaiai.com,http://localhost:5173
|
||||
JWT_SECRET: ${JWT_SECRET:-your-jwt-secret-key}
|
||||
ports:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,23 @@
|
|||
FROM python:3.11-slim

WORKDIR /app

# System packages: shared libraries required by OCR and image processing.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1 libglib2.0-0 libsm6 libxext6 libxrender1 \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Pre-download Docling models at build time to avoid first-request latency.
# This runs BEFORE copying the app code so that routine code changes do not
# invalidate this slow, large layer and force a model re-download.
RUN python -c "from docling.document_converter import DocumentConverter; DocumentConverter()"

COPY app/ ./app/

EXPOSE 3007

# Long start-period: cold start can take a while if models must initialize.
HEALTHCHECK --interval=30s --timeout=10s --retries=3 --start-period=120s \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:3007/health')" || exit 1

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "3007"]
|
||||
|
|
@ -0,0 +1,118 @@
|
|||
"""Docling Document Parsing Service — FastAPI
|
||||
|
||||
IBM Docling 文档解析微服务,提供高质量的 PDF/DOCX 文本提取,
|
||||
包括表格结构识别(TableFormer)和 OCR 支持。
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
|
||||
from fastapi import FastAPI, File, HTTPException, UploadFile
|
||||
|
||||
app = FastAPI(title="Docling Service", version="0.1.0")
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger("docling-service")
|
||||
|
||||
# 延迟初始化 — 首次请求时加载模型
|
||||
_converter = None
|
||||
|
||||
|
||||
def get_converter():
    """Return the process-wide DocumentConverter, creating it on first use.

    Import and construction are deferred so the service starts quickly;
    model loading only happens when the first document arrives.
    """
    global _converter
    if _converter is not None:
        return _converter

    from docling.document_converter import DocumentConverter

    logger.info("Initializing DocumentConverter (loading models)...")
    _converter = DocumentConverter()
    logger.info("DocumentConverter ready")
    return _converter
@app.get("/health")
|
||||
def health():
|
||||
return {"status": "ok"}
|
||||
|
||||
|
||||
@app.post("/extract")
|
||||
async def extract_text(file: UploadFile = File(...)):
|
||||
"""接收文件,使用 Docling 解析后返回结构化 Markdown 文本"""
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="Missing filename")
|
||||
|
||||
allowed_types = {
|
||||
"application/pdf",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"text/plain",
|
||||
"text/markdown",
|
||||
}
|
||||
content_type = file.content_type or ""
|
||||
|
||||
# 纯文本:直接返回,无需 Docling
|
||||
if content_type in ("text/plain", "text/markdown"):
|
||||
raw = (await file.read()).decode("utf-8", errors="replace")
|
||||
return {
|
||||
"text": raw.strip(),
|
||||
"title": _derive_title(file.filename),
|
||||
"wordCount": _count_words(raw),
|
||||
}
|
||||
|
||||
if content_type not in allowed_types:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Unsupported file type: {content_type}",
|
||||
)
|
||||
|
||||
# Docling 需要文件路径 — 写入临时文件
|
||||
suffix = ".pdf" if "pdf" in content_type else ".docx"
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
||||
tmp.write(await file.read())
|
||||
tmp_path = tmp.name
|
||||
|
||||
try:
|
||||
converter = get_converter()
|
||||
result = converter.convert(tmp_path)
|
||||
md_text = result.document.export_to_markdown()
|
||||
|
||||
page_count = None
|
||||
if hasattr(result.document, "pages"):
|
||||
page_count = len(result.document.pages)
|
||||
|
||||
logger.info(
|
||||
"Extracted %d chars from '%s' (%s pages)",
|
||||
len(md_text),
|
||||
file.filename,
|
||||
page_count or "?",
|
||||
)
|
||||
|
||||
return {
|
||||
"text": md_text.strip(),
|
||||
"title": _derive_title(file.filename),
|
||||
"pageCount": page_count,
|
||||
"wordCount": _count_words(md_text),
|
||||
}
|
||||
except Exception as e:
|
||||
logger.exception("Docling conversion failed for '%s'", file.filename)
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=f"Extraction failed: {str(e)}",
|
||||
)
|
||||
finally:
|
||||
os.unlink(tmp_path)
|
||||
|
||||
|
||||
def _derive_title(filename: str) -> str:
|
||||
"""从文件名推导标题"""
|
||||
name = os.path.splitext(filename)[0]
|
||||
return re.sub(r"[-_]+", " ", name).strip()
|
||||
|
||||
|
||||
def _count_words(text: str) -> int:
|
||||
"""中英文混合字数统计"""
|
||||
chinese = len(re.findall(r"[\u4e00-\u9fff]", text))
|
||||
english = len(
|
||||
[w for w in re.sub(r"[\u4e00-\u9fff]", "", text).split() if w]
|
||||
)
|
||||
return chinese + english
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
fastapi==0.115.0
|
||||
uvicorn[standard]==0.30.0
|
||||
python-multipart==0.0.9
|
||||
docling==2.14.0
|
||||
|
|
@ -1,9 +1,13 @@
|
|||
/**
|
||||
* Text Extraction Service
|
||||
* 从上传的文件中提取文本内容(PDF、Word、TXT、Markdown)
|
||||
*
|
||||
* 优先使用 Docling 微服务(高质量:表格结构识别 + OCR),
|
||||
* 失败时回退到 pdf-parse / mammoth(纯文本提取)。
|
||||
*/
|
||||
|
||||
import { Injectable, BadRequestException, Logger } from '@nestjs/common';
|
||||
import { Injectable, BadRequestException, Logger, Optional } from '@nestjs/common';
|
||||
import { ConfigService } from '@nestjs/config';
|
||||
import { PDFParse } from 'pdf-parse';
|
||||
import * as mammoth from 'mammoth';
|
||||
|
||||
|
|
@ -26,6 +30,16 @@ const MAX_FILE_SIZE = 200 * 1024 * 1024; // 200MB
|
|||
@Injectable()
|
||||
export class TextExtractionService {
|
||||
  private readonly logger = new Logger(TextExtractionService.name);
  // Base URL of the Docling microservice; null disables Docling entirely
  // and every extraction uses the legacy pdf-parse/mammoth path.
  private readonly doclingUrl: string | null;

  // ConfigService is @Optional() so the service still constructs (legacy
  // extractors only) in contexts where Nest's ConfigModule is not wired in.
  constructor(@Optional() private readonly configService?: ConfigService) {
    this.doclingUrl = this.configService?.get<string>('DOCLING_SERVICE_URL') || null;
    if (this.doclingUrl) {
      this.logger.log(`Docling service configured: ${this.doclingUrl}`);
    } else {
      this.logger.warn('DOCLING_SERVICE_URL not set — using legacy pdf-parse/mammoth');
    }
  }
|
||||
|
||||
validateFile(file: Express.Multer.File): void {
|
||||
if (!ALLOWED_MIME_TYPES.includes(file.mimetype)) {
|
||||
|
|
@ -48,6 +62,18 @@ export class TextExtractionService {
|
|||
`Extracting text from "${file.originalname}" (${file.mimetype}, ${(file.size / 1024).toFixed(0)}KB)`,
|
||||
);
|
||||
|
||||
// 优先使用 Docling(PDF 和 DOCX — 表格结构 + OCR)
|
||||
if (this.doclingUrl && file.mimetype !== 'text/plain' && file.mimetype !== 'text/markdown') {
|
||||
try {
|
||||
return await this.extractViaDocling(file);
|
||||
} catch (err) {
|
||||
this.logger.warn(
|
||||
`Docling extraction failed for "${file.originalname}", falling back to legacy: ${err}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// 回退到 pdf-parse / mammoth / 直接文本
|
||||
switch (file.mimetype) {
|
||||
case 'application/pdf':
|
||||
return this.extractFromPdf(file.buffer, title);
|
||||
|
|
@ -61,6 +87,51 @@ export class TextExtractionService {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 通过 Docling 微服务提取文本(高质量)
|
||||
* HTTP POST multipart/form-data → JSON { text, title, pageCount, wordCount }
|
||||
*/
|
||||
private async extractViaDocling(file: Express.Multer.File): Promise<ExtractedContent> {
|
||||
const formData = new FormData();
|
||||
formData.append(
|
||||
'file',
|
||||
new Blob([file.buffer], { type: file.mimetype }),
|
||||
file.originalname,
|
||||
);
|
||||
|
||||
const response = await fetch(`${this.doclingUrl}/extract`, {
|
||||
method: 'POST',
|
||||
body: formData,
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
const errText = await response.text();
|
||||
throw new Error(`Docling HTTP ${response.status}: ${errText}`);
|
||||
}
|
||||
|
||||
const result = (await response.json()) as {
|
||||
text: string;
|
||||
title: string;
|
||||
pageCount?: number;
|
||||
wordCount: number;
|
||||
};
|
||||
|
||||
this.logger.log(
|
||||
`Docling extracted ${result.wordCount} words from "${file.originalname}"`,
|
||||
);
|
||||
|
||||
return {
|
||||
text: result.text,
|
||||
title: result.title,
|
||||
pageCount: result.pageCount ?? undefined,
|
||||
wordCount: result.wordCount,
|
||||
};
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// Legacy extractors (fallback)
|
||||
// ============================================================
|
||||
|
||||
private async extractFromPdf(
|
||||
buffer: Buffer,
|
||||
title: string,
|
||||
|
|
|
|||
Loading…
Reference in New Issue