feat(knowledge): add Docling document parsing microservice

Add IBM Docling as a Python FastAPI microservice for high-quality document
parsing with table structure recognition (TableFormer ~94% accuracy) and
OCR support, replacing pdf-parse/mammoth as the primary text extractor.

Architecture:
- New docling-service (Python FastAPI, port 3007) in Docker network
- knowledge-service calls docling-service via HTTP POST multipart/form-data
- Graceful fallback: if Docling fails, falls back to pdf-parse/mammoth
- Text/Markdown files skip Docling (no benefit for plain text)

Changes:
- New: packages/services/docling-service/ (main.py, Dockerfile, requirements.txt)
- docker-compose.yml: add docling-service, wire DOCLING_SERVICE_URL to
  knowledge-service, add missing FILE_SERVICE_URL to conversation-service
- text-extraction.service.ts: inject ConfigService, add extractViaDocling()
  with automatic fallback to legacy extractors
- .env.example: add FILE_SERVICE_PORT/URL and DOCLING_SERVICE_PORT/URL

Inter-service communication map:
  conversation-service → file-service (FILE_SERVICE_URL, attachments)
  conversation-service → knowledge-service (KNOWLEDGE_SERVICE_URL, RAG)
  knowledge-service → docling-service (DOCLING_SERVICE_URL, document parsing)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-02-07 05:24:10 -08:00
parent 470ec9a64e
commit 57d21526a5
7 changed files with 244 additions and 1 deletions

View File

@ -119,6 +119,8 @@ PAYMENT_SERVICE_PORT=3002
KNOWLEDGE_SERVICE_PORT=3003 KNOWLEDGE_SERVICE_PORT=3003
CONVERSATION_SERVICE_PORT=3004 CONVERSATION_SERVICE_PORT=3004
EVOLUTION_SERVICE_PORT=3005 EVOLUTION_SERVICE_PORT=3005
FILE_SERVICE_PORT=3006
DOCLING_SERVICE_PORT=3007
# =========================================== # ===========================================
# 服务间通信 URL # 服务间通信 URL
@ -128,6 +130,10 @@ PAYMENT_SERVICE_URL=http://localhost:3002
KNOWLEDGE_SERVICE_URL=http://localhost:3003 KNOWLEDGE_SERVICE_URL=http://localhost:3003
CONVERSATION_SERVICE_URL=http://localhost:3004 CONVERSATION_SERVICE_URL=http://localhost:3004
EVOLUTION_SERVICE_URL=http://localhost:3005 EVOLUTION_SERVICE_URL=http://localhost:3005
# 文件服务(conversation-service 下载附件用)
FILE_SERVICE_URL=http://localhost:3006
# Docling 文档解析服务(PDF/DOCX 表格结构识别 + OCR)
DOCLING_SERVICE_URL=http://localhost:3007
# =========================================== # ===========================================
# Kong API Gateway # Kong API Gateway

View File

@ -250,6 +250,23 @@ services:
networks: networks:
- iconsulting-network - iconsulting-network
docling-service:
build:
context: ./packages/services/docling-service
dockerfile: Dockerfile
container_name: iconsulting-docling
restart: unless-stopped
ports:
- "3007:3007"
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:3007/health')"]
interval: 30s
timeout: 10s
retries: 3
start_period: 120s
networks:
- iconsulting-network
knowledge-service: knowledge-service:
build: build:
context: . context: .
@ -263,6 +280,8 @@ services:
condition: service_healthy condition: service_healthy
neo4j: neo4j:
condition: service_healthy condition: service_healthy
docling-service:
condition: service_healthy
environment: environment:
NODE_ENV: production NODE_ENV: production
PORT: 3003 PORT: 3003
@ -278,6 +297,7 @@ services:
NEO4J_PASSWORD: ${NEO4J_PASSWORD:-neo4j123} NEO4J_PASSWORD: ${NEO4J_PASSWORD:-neo4j123}
OPENAI_API_KEY: ${OPENAI_API_KEY} OPENAI_API_KEY: ${OPENAI_API_KEY}
OPENAI_BASE_URL: ${OPENAI_BASE_URL:-} OPENAI_BASE_URL: ${OPENAI_BASE_URL:-}
DOCLING_SERVICE_URL: http://docling-service:3007
ports: ports:
- "3003:3003" - "3003:3003"
healthcheck: healthcheck:
@ -315,6 +335,7 @@ services:
ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY} ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY}
ANTHROPIC_BASE_URL: ${ANTHROPIC_BASE_URL:-https://api.anthropic.com} ANTHROPIC_BASE_URL: ${ANTHROPIC_BASE_URL:-https://api.anthropic.com}
KNOWLEDGE_SERVICE_URL: http://knowledge-service:3003 KNOWLEDGE_SERVICE_URL: http://knowledge-service:3003
FILE_SERVICE_URL: http://file-service:3006
CORS_ORIGINS: https://iconsulting.szaiai.com,http://localhost:5173 CORS_ORIGINS: https://iconsulting.szaiai.com,http://localhost:5173
JWT_SECRET: ${JWT_SECRET:-your-jwt-secret-key} JWT_SECRET: ${JWT_SECRET:-your-jwt-secret-key}
ports: ports:

View File

@ -0,0 +1,23 @@
FROM python:3.11-slim
WORKDIR /app
# 系统依赖OCR、图像处理所需的共享库
RUN apt-get update && apt-get install -y --no-install-recommends \
libgl1 libglib2.0-0 libsm6 libxext6 libxrender1 \
&& rm -rf /var/lib/apt/lists/*
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY app/ ./app/
# 构建时预下载模型,避免首次请求延迟
RUN python -c "from docling.document_converter import DocumentConverter; DocumentConverter()"
EXPOSE 3007
HEALTHCHECK --interval=30s --timeout=10s --retries=3 --start-period=120s \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:3007/health')" || exit 1
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "3007"]

View File

@ -0,0 +1,118 @@
"""Docling Document Parsing Service — FastAPI
IBM Docling 文档解析微服务提供高质量的 PDF/DOCX 文本提取
包括表格结构识别TableFormer OCR 支持
"""
import logging
import os
import re
import tempfile
from fastapi import FastAPI, File, HTTPException, UploadFile
app = FastAPI(title="Docling Service", version="0.1.0")
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("docling-service")
# 延迟初始化 — 首次请求时加载模型
_converter = None
def get_converter():
global _converter
if _converter is None:
from docling.document_converter import DocumentConverter
logger.info("Initializing DocumentConverter (loading models)...")
_converter = DocumentConverter()
logger.info("DocumentConverter ready")
return _converter
@app.get("/health")
def health():
return {"status": "ok"}
@app.post("/extract")
async def extract_text(file: UploadFile = File(...)):
"""接收文件,使用 Docling 解析后返回结构化 Markdown 文本"""
if not file.filename:
raise HTTPException(status_code=400, detail="Missing filename")
allowed_types = {
"application/pdf",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"text/plain",
"text/markdown",
}
content_type = file.content_type or ""
# 纯文本:直接返回,无需 Docling
if content_type in ("text/plain", "text/markdown"):
raw = (await file.read()).decode("utf-8", errors="replace")
return {
"text": raw.strip(),
"title": _derive_title(file.filename),
"wordCount": _count_words(raw),
}
if content_type not in allowed_types:
raise HTTPException(
status_code=400,
detail=f"Unsupported file type: {content_type}",
)
# Docling 需要文件路径 — 写入临时文件
suffix = ".pdf" if "pdf" in content_type else ".docx"
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
tmp.write(await file.read())
tmp_path = tmp.name
try:
converter = get_converter()
result = converter.convert(tmp_path)
md_text = result.document.export_to_markdown()
page_count = None
if hasattr(result.document, "pages"):
page_count = len(result.document.pages)
logger.info(
"Extracted %d chars from '%s' (%s pages)",
len(md_text),
file.filename,
page_count or "?",
)
return {
"text": md_text.strip(),
"title": _derive_title(file.filename),
"pageCount": page_count,
"wordCount": _count_words(md_text),
}
except Exception as e:
logger.exception("Docling conversion failed for '%s'", file.filename)
raise HTTPException(
status_code=500,
detail=f"Extraction failed: {str(e)}",
)
finally:
os.unlink(tmp_path)
def _derive_title(filename: str) -> str:
"""从文件名推导标题"""
name = os.path.splitext(filename)[0]
return re.sub(r"[-_]+", " ", name).strip()
def _count_words(text: str) -> int:
"""中英文混合字数统计"""
chinese = len(re.findall(r"[\u4e00-\u9fff]", text))
english = len(
[w for w in re.sub(r"[\u4e00-\u9fff]", "", text).split() if w]
)
return chinese + english

View File

@ -0,0 +1,4 @@
fastapi==0.115.0
uvicorn[standard]==0.30.0
python-multipart==0.0.9
docling==2.14.0

View File

@ -1,9 +1,13 @@
/** /**
* Text Extraction Service * Text Extraction Service
* PDFWordTXTMarkdown * PDFWordTXTMarkdown
*
* 使 Docling + OCR
* 退 pdf-parse / mammoth
*/ */
import { Injectable, BadRequestException, Logger } from '@nestjs/common'; import { Injectable, BadRequestException, Logger, Optional } from '@nestjs/common';
import { ConfigService } from '@nestjs/config';
import { PDFParse } from 'pdf-parse'; import { PDFParse } from 'pdf-parse';
import * as mammoth from 'mammoth'; import * as mammoth from 'mammoth';
@ -26,6 +30,16 @@ const MAX_FILE_SIZE = 200 * 1024 * 1024; // 200MB
@Injectable() @Injectable()
export class TextExtractionService { export class TextExtractionService {
private readonly logger = new Logger(TextExtractionService.name); private readonly logger = new Logger(TextExtractionService.name);
private readonly doclingUrl: string | null;
constructor(@Optional() private readonly configService?: ConfigService) {
this.doclingUrl = this.configService?.get<string>('DOCLING_SERVICE_URL') || null;
if (this.doclingUrl) {
this.logger.log(`Docling service configured: ${this.doclingUrl}`);
} else {
this.logger.warn('DOCLING_SERVICE_URL not set — using legacy pdf-parse/mammoth');
}
}
validateFile(file: Express.Multer.File): void { validateFile(file: Express.Multer.File): void {
if (!ALLOWED_MIME_TYPES.includes(file.mimetype)) { if (!ALLOWED_MIME_TYPES.includes(file.mimetype)) {
@ -48,6 +62,18 @@ export class TextExtractionService {
`Extracting text from "${file.originalname}" (${file.mimetype}, ${(file.size / 1024).toFixed(0)}KB)`, `Extracting text from "${file.originalname}" (${file.mimetype}, ${(file.size / 1024).toFixed(0)}KB)`,
); );
// 优先使用 DoclingPDF 和 DOCX — 表格结构 + OCR
if (this.doclingUrl && file.mimetype !== 'text/plain' && file.mimetype !== 'text/markdown') {
try {
return await this.extractViaDocling(file);
} catch (err) {
this.logger.warn(
`Docling extraction failed for "${file.originalname}", falling back to legacy: ${err}`,
);
}
}
// 回退到 pdf-parse / mammoth / 直接文本
switch (file.mimetype) { switch (file.mimetype) {
case 'application/pdf': case 'application/pdf':
return this.extractFromPdf(file.buffer, title); return this.extractFromPdf(file.buffer, title);
@ -61,6 +87,51 @@ export class TextExtractionService {
} }
} }
/**
* Docling
* HTTP POST multipart/form-data JSON { text, title, pageCount, wordCount }
*/
private async extractViaDocling(file: Express.Multer.File): Promise<ExtractedContent> {
const formData = new FormData();
formData.append(
'file',
new Blob([file.buffer], { type: file.mimetype }),
file.originalname,
);
const response = await fetch(`${this.doclingUrl}/extract`, {
method: 'POST',
body: formData,
});
if (!response.ok) {
const errText = await response.text();
throw new Error(`Docling HTTP ${response.status}: ${errText}`);
}
const result = (await response.json()) as {
text: string;
title: string;
pageCount?: number;
wordCount: number;
};
this.logger.log(
`Docling extracted ${result.wordCount} words from "${file.originalname}"`,
);
return {
text: result.text,
title: result.title,
pageCount: result.pageCount ?? undefined,
wordCount: result.wordCount,
};
}
// ============================================================
// Legacy extractors (fallback)
// ============================================================
private async extractFromPdf( private async extractFromPdf(
buffer: Buffer, buffer: Buffer,
title: string, title: string,