feat(knowledge): add Docling document parsing microservice
Add IBM Docling as a Python FastAPI microservice for high-quality document parsing with table structure recognition (TableFormer ~94% accuracy) and OCR support, replacing pdf-parse/mammoth as the primary text extractor. Architecture: - New docling-service (Python FastAPI, port 3007) in Docker network - knowledge-service calls docling-service via HTTP POST multipart/form-data - Graceful fallback: if Docling fails, falls back to pdf-parse/mammoth - Text/Markdown files skip Docling (no benefit for plain text) Changes: - New: packages/services/docling-service/ (main.py, Dockerfile, requirements.txt) - docker-compose.yml: add docling-service, wire DOCLING_SERVICE_URL to knowledge-service, add missing FILE_SERVICE_URL to conversation-service - text-extraction.service.ts: inject ConfigService, add extractViaDocling() with automatic fallback to legacy extractors - .env.example: add FILE_SERVICE_PORT/URL and DOCLING_SERVICE_PORT/URL Inter-service communication map: conversation-service → file-service (FILE_SERVICE_URL, attachments) conversation-service → knowledge-service (KNOWLEDGE_SERVICE_URL, RAG) knowledge-service → docling-service (DOCLING_SERVICE_URL, document parsing) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
470ec9a64e
commit
57d21526a5
|
|
@ -119,6 +119,8 @@ PAYMENT_SERVICE_PORT=3002
|
||||||
KNOWLEDGE_SERVICE_PORT=3003
|
KNOWLEDGE_SERVICE_PORT=3003
|
||||||
CONVERSATION_SERVICE_PORT=3004
|
CONVERSATION_SERVICE_PORT=3004
|
||||||
EVOLUTION_SERVICE_PORT=3005
|
EVOLUTION_SERVICE_PORT=3005
|
||||||
|
FILE_SERVICE_PORT=3006
|
||||||
|
DOCLING_SERVICE_PORT=3007
|
||||||
|
|
||||||
# ===========================================
|
# ===========================================
|
||||||
# 服务间通信 URL
|
# 服务间通信 URL
|
||||||
|
|
@ -128,6 +130,10 @@ PAYMENT_SERVICE_URL=http://localhost:3002
|
||||||
KNOWLEDGE_SERVICE_URL=http://localhost:3003
|
KNOWLEDGE_SERVICE_URL=http://localhost:3003
|
||||||
CONVERSATION_SERVICE_URL=http://localhost:3004
|
CONVERSATION_SERVICE_URL=http://localhost:3004
|
||||||
EVOLUTION_SERVICE_URL=http://localhost:3005
|
EVOLUTION_SERVICE_URL=http://localhost:3005
|
||||||
|
# 文件服务(conversation-service 下载附件用)
|
||||||
|
FILE_SERVICE_URL=http://localhost:3006
|
||||||
|
# Docling 文档解析服务(PDF/DOCX 表格结构识别 + OCR)
|
||||||
|
DOCLING_SERVICE_URL=http://localhost:3007
|
||||||
|
|
||||||
# ===========================================
|
# ===========================================
|
||||||
# Kong API Gateway
|
# Kong API Gateway
|
||||||
|
|
|
||||||
|
|
@ -250,6 +250,23 @@ services:
|
||||||
networks:
|
networks:
|
||||||
- iconsulting-network
|
- iconsulting-network
|
||||||
|
|
||||||
|
docling-service:
|
||||||
|
build:
|
||||||
|
context: ./packages/services/docling-service
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
container_name: iconsulting-docling
|
||||||
|
restart: unless-stopped
|
||||||
|
ports:
|
||||||
|
- "3007:3007"
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:3007/health')"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
start_period: 120s
|
||||||
|
networks:
|
||||||
|
- iconsulting-network
|
||||||
|
|
||||||
knowledge-service:
|
knowledge-service:
|
||||||
build:
|
build:
|
||||||
context: .
|
context: .
|
||||||
|
|
@ -263,6 +280,8 @@ services:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
neo4j:
|
neo4j:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
docling-service:
|
||||||
|
condition: service_healthy
|
||||||
environment:
|
environment:
|
||||||
NODE_ENV: production
|
NODE_ENV: production
|
||||||
PORT: 3003
|
PORT: 3003
|
||||||
|
|
@ -278,6 +297,7 @@ services:
|
||||||
NEO4J_PASSWORD: ${NEO4J_PASSWORD:-neo4j123}
|
NEO4J_PASSWORD: ${NEO4J_PASSWORD:-neo4j123}
|
||||||
OPENAI_API_KEY: ${OPENAI_API_KEY}
|
OPENAI_API_KEY: ${OPENAI_API_KEY}
|
||||||
OPENAI_BASE_URL: ${OPENAI_BASE_URL:-}
|
OPENAI_BASE_URL: ${OPENAI_BASE_URL:-}
|
||||||
|
DOCLING_SERVICE_URL: http://docling-service:3007
|
||||||
ports:
|
ports:
|
||||||
- "3003:3003"
|
- "3003:3003"
|
||||||
healthcheck:
|
healthcheck:
|
||||||
|
|
@ -315,6 +335,7 @@ services:
|
||||||
ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY}
|
ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY}
|
||||||
ANTHROPIC_BASE_URL: ${ANTHROPIC_BASE_URL:-https://api.anthropic.com}
|
ANTHROPIC_BASE_URL: ${ANTHROPIC_BASE_URL:-https://api.anthropic.com}
|
||||||
KNOWLEDGE_SERVICE_URL: http://knowledge-service:3003
|
KNOWLEDGE_SERVICE_URL: http://knowledge-service:3003
|
||||||
|
FILE_SERVICE_URL: http://file-service:3006
|
||||||
CORS_ORIGINS: https://iconsulting.szaiai.com,http://localhost:5173
|
CORS_ORIGINS: https://iconsulting.szaiai.com,http://localhost:5173
|
||||||
JWT_SECRET: ${JWT_SECRET:-your-jwt-secret-key}
|
JWT_SECRET: ${JWT_SECRET:-your-jwt-secret-key}
|
||||||
ports:
|
ports:
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,23 @@
|
||||||
|
FROM python:3.11-slim

WORKDIR /app

# System dependencies: shared libraries required by OCR and image processing.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1 libglib2.0-0 libsm6 libxext6 libxrender1 \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Warm up Docling at build time to reduce first-request latency.
# This step depends only on the installed packages, so it runs BEFORE the
# application code is copied: editing app/ no longer invalidates this
# (expensive) layer and the build cache can reuse it.
# NOTE(review): confirm that merely constructing DocumentConverter() fetches
# the model weights for the pinned docling version; if pipelines are created
# lazily, an explicit model download step is needed here instead.
RUN python -c "from docling.document_converter import DocumentConverter; DocumentConverter()"

COPY app/ ./app/

EXPOSE 3007

HEALTHCHECK --interval=30s --timeout=10s --retries=3 --start-period=120s \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:3007/health')" || exit 1

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "3007"]
|
||||||
|
|
@ -0,0 +1,118 @@
|
||||||
|
"""Docling Document Parsing Service — FastAPI
|
||||||
|
|
||||||
|
IBM Docling 文档解析微服务,提供高质量的 PDF/DOCX 文本提取,
|
||||||
|
包括表格结构识别(TableFormer)和 OCR 支持。
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
import logging
import os
import re
import tempfile
|
||||||
|
|
||||||
|
from fastapi import FastAPI, File, HTTPException, UploadFile
|
||||||
|
|
||||||
|
# Process-wide logging configuration and the logger used by this service.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("docling-service")

# The ASGI application served by uvicorn.
app = FastAPI(title="Docling Service", version="0.1.0")

# Lazily initialised: the converter (and its models) is created on first use,
# not at import time. See get_converter().
_converter = None
|
||||||
|
|
||||||
|
|
||||||
|
def get_converter():
    """Return the shared ``DocumentConverter``, creating it on first call.

    The docling import is deferred into this function so that importing the
    module stays cheap; the instance is cached in the module-level
    ``_converter`` and reused by every later call.
    """
    global _converter

    # Fast path: already built by an earlier call.
    if _converter is not None:
        return _converter

    from docling.document_converter import DocumentConverter

    logger.info("Initializing DocumentConverter (loading models)...")
    _converter = DocumentConverter()
    logger.info("DocumentConverter ready")
    return _converter
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health")
def health():
    """Health-check endpoint: always answers with a static ``ok`` status."""
    status_payload = {"status": "ok"}
    return status_payload
|
||||||
|
|
||||||
|
|
||||||
|
# Serialises Docling conversions. Conversions run in a worker thread so the
# event loop stays responsive (e.g. for /health); the lock keeps at most one
# conversion in flight, matching the previous one-at-a-time behaviour.
_convert_lock = asyncio.Lock()


def _convert_to_markdown(path: str):
    """Run Docling on ``path`` and return ``(markdown_text, page_count)``.

    This call is blocking and is meant to be executed in a worker thread.
    ``page_count`` is ``None`` when the parsed document has no ``pages``
    attribute.
    """
    document = get_converter().convert(path).document
    page_count = len(document.pages) if hasattr(document, "pages") else None
    return document.export_to_markdown(), page_count


@app.post("/extract")
async def extract_text(file: UploadFile = File(...)):
    """Parse an uploaded file and return its text as structured Markdown.

    Plain-text and Markdown uploads are decoded as UTF-8 and returned
    directly; PDF and DOCX uploads are written to a temporary file and
    converted with Docling.

    Returns a dict with ``text``, ``title`` and ``wordCount``; Docling-parsed
    documents additionally carry ``pageCount`` (``None`` if unknown).

    Raises:
        HTTPException(400): missing filename or unsupported content type.
        HTTPException(500): the upload could not be staged or converted.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="Missing filename")

    allowed_types = {
        "application/pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "text/plain",
        "text/markdown",
    }
    raw_content_type = file.content_type or ""
    # Ignore MIME parameters and case, so "text/plain; charset=utf-8" is
    # treated the same as "text/plain".
    content_type = raw_content_type.split(";", 1)[0].strip().lower()

    # Plain text: return as-is, Docling adds nothing here.
    if content_type in ("text/plain", "text/markdown"):
        raw = (await file.read()).decode("utf-8", errors="replace")
        return {
            "text": raw.strip(),
            "title": _derive_title(file.filename),
            "wordCount": _count_words(raw),
        }

    if content_type not in allowed_types:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type: {raw_content_type}",
        )

    # Docling needs a file path, so the upload is staged in a temporary file.
    suffix = ".pdf" if content_type == "application/pdf" else ".docx"
    tmp_path = None

    try:
        # Staging happens inside the try block so that a failed read/write
        # still reaches the cleanup in ``finally`` (no leaked temp files).
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = tmp.name
            tmp.write(await file.read())

        # Off-load the blocking conversion so the event loop is not stalled.
        async with _convert_lock:
            md_text, page_count = await asyncio.to_thread(
                _convert_to_markdown, tmp_path
            )

        logger.info(
            "Extracted %d chars from '%s' (%s pages)",
            len(md_text),
            file.filename,
            # Explicit None check: a page count of 0 must not be shown as "?".
            "?" if page_count is None else page_count,
        )

        return {
            "text": md_text.strip(),
            "title": _derive_title(file.filename),
            "pageCount": page_count,
            "wordCount": _count_words(md_text),
        }
    except Exception as e:
        logger.exception("Docling conversion failed for '%s'", file.filename)
        raise HTTPException(
            status_code=500,
            detail=f"Extraction failed: {str(e)}",
        ) from e
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Cleanup is best-effort; never mask the real outcome.
                logger.warning("Could not remove temp file '%s'", tmp_path)
|
||||||
|
|
||||||
|
|
||||||
|
def _derive_title(filename: str) -> str:
|
||||||
|
"""从文件名推导标题"""
|
||||||
|
name = os.path.splitext(filename)[0]
|
||||||
|
return re.sub(r"[-_]+", " ", name).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _count_words(text: str) -> int:
|
||||||
|
"""中英文混合字数统计"""
|
||||||
|
chinese = len(re.findall(r"[\u4e00-\u9fff]", text))
|
||||||
|
english = len(
|
||||||
|
[w for w in re.sub(r"[\u4e00-\u9fff]", "", text).split() if w]
|
||||||
|
)
|
||||||
|
return chinese + english
|
||||||
|
|
@ -0,0 +1,4 @@
|
||||||
|
fastapi==0.115.0
|
||||||
|
uvicorn[standard]==0.30.0
|
||||||
|
python-multipart==0.0.9
|
||||||
|
docling==2.14.0
|
||||||
|
|
@ -1,9 +1,13 @@
|
||||||
/**
|
/**
|
||||||
* Text Extraction Service
|
* Text Extraction Service
|
||||||
* 从上传的文件中提取文本内容(PDF、Word、TXT、Markdown)
|
* 从上传的文件中提取文本内容(PDF、Word、TXT、Markdown)
|
||||||
|
*
|
||||||
|
* 优先使用 Docling 微服务(高质量:表格结构识别 + OCR),
|
||||||
|
* 失败时回退到 pdf-parse / mammoth(纯文本提取)。
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { Injectable, BadRequestException, Logger } from '@nestjs/common';
|
import { Injectable, BadRequestException, Logger, Optional } from '@nestjs/common';
|
||||||
|
import { ConfigService } from '@nestjs/config';
|
||||||
import { PDFParse } from 'pdf-parse';
|
import { PDFParse } from 'pdf-parse';
|
||||||
import * as mammoth from 'mammoth';
|
import * as mammoth from 'mammoth';
|
||||||
|
|
||||||
|
|
@ -26,6 +30,16 @@ const MAX_FILE_SIZE = 200 * 1024 * 1024; // 200MB
|
||||||
@Injectable()
|
@Injectable()
|
||||||
export class TextExtractionService {
|
export class TextExtractionService {
|
||||||
private readonly logger = new Logger(TextExtractionService.name);
|
private readonly logger = new Logger(TextExtractionService.name);
|
||||||
|
private readonly doclingUrl: string | null;
|
||||||
|
|
||||||
|
/**
 * Reads DOCLING_SERVICE_URL from the (optional) ConfigService once at
 * construction time. When the value is missing or empty, `doclingUrl` is
 * null and a warning about using the legacy extractors is logged.
 */
constructor(@Optional() private readonly configService?: ConfigService) {
  const configuredUrl = this.configService?.get<string>('DOCLING_SERVICE_URL');
  this.doclingUrl = configuredUrl ? configuredUrl : null;

  if (this.doclingUrl === null) {
    this.logger.warn('DOCLING_SERVICE_URL not set — using legacy pdf-parse/mammoth');
    return;
  }

  this.logger.log(`Docling service configured: ${this.doclingUrl}`);
}
|
||||||
|
|
||||||
validateFile(file: Express.Multer.File): void {
|
validateFile(file: Express.Multer.File): void {
|
||||||
if (!ALLOWED_MIME_TYPES.includes(file.mimetype)) {
|
if (!ALLOWED_MIME_TYPES.includes(file.mimetype)) {
|
||||||
|
|
@ -48,6 +62,18 @@ export class TextExtractionService {
|
||||||
`Extracting text from "${file.originalname}" (${file.mimetype}, ${(file.size / 1024).toFixed(0)}KB)`,
|
`Extracting text from "${file.originalname}" (${file.mimetype}, ${(file.size / 1024).toFixed(0)}KB)`,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// 优先使用 Docling(PDF 和 DOCX — 表格结构 + OCR)
|
||||||
|
if (this.doclingUrl && file.mimetype !== 'text/plain' && file.mimetype !== 'text/markdown') {
|
||||||
|
try {
|
||||||
|
return await this.extractViaDocling(file);
|
||||||
|
} catch (err) {
|
||||||
|
this.logger.warn(
|
||||||
|
`Docling extraction failed for "${file.originalname}", falling back to legacy: ${err}`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 回退到 pdf-parse / mammoth / 直接文本
|
||||||
switch (file.mimetype) {
|
switch (file.mimetype) {
|
||||||
case 'application/pdf':
|
case 'application/pdf':
|
||||||
return this.extractFromPdf(file.buffer, title);
|
return this.extractFromPdf(file.buffer, title);
|
||||||
|
|
@ -61,6 +87,51 @@ export class TextExtractionService {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Extracts text through the Docling microservice (high quality).
 *
 * Sends the file as HTTP POST multipart/form-data to `<doclingUrl>/extract`
 * and expects JSON `{ text, title, pageCount, wordCount }` in return.
 *
 * @param file - The uploaded file; its buffer, MIME type and original name
 *   are forwarded to Docling.
 * @returns The extracted content as reported by Docling.
 * @throws Error when the service URL is not configured, the service answers
 *   with a non-2xx status, or the response body does not have the expected
 *   shape.
 */
private async extractViaDocling(file: Express.Multer.File): Promise<ExtractedContent> {
  if (!this.doclingUrl) {
    throw new Error('Docling service URL is not configured');
  }

  // Tolerate a trailing slash in the configured URL (avoids ".../​/extract").
  const endpoint = `${this.doclingUrl.replace(/\/+$/, '')}/extract`;

  const formData = new FormData();
  formData.append(
    'file',
    new Blob([file.buffer], { type: file.mimetype }),
    file.originalname,
  );

  const response = await fetch(endpoint, {
    method: 'POST',
    body: formData,
  });

  if (!response.ok) {
    const errText = await response.text();
    throw new Error(`Docling HTTP ${response.status}: ${errText}`);
  }

  // Validate the payload instead of blindly casting it: a malformed body
  // must surface as an error, not as a "successful" extraction.
  const payload: unknown = await response.json();
  if (typeof payload !== 'object' || payload === null) {
    throw new Error('Docling returned a non-object response body');
  }

  const { text, title, pageCount, wordCount } = payload as Record<string, unknown>;
  if (
    typeof text !== 'string' ||
    typeof title !== 'string' ||
    typeof wordCount !== 'number' ||
    !Number.isFinite(wordCount)
  ) {
    throw new Error('Docling response is missing text, title or wordCount');
  }

  this.logger.log(`Docling extracted ${wordCount} words from "${file.originalname}"`);

  return {
    text,
    title,
    // The service may send null (or omit the field) when the count is unknown.
    pageCount: typeof pageCount === 'number' ? pageCount : undefined,
    wordCount,
  };
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// Legacy extractors (fallback)
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
private async extractFromPdf(
|
private async extractFromPdf(
|
||||||
buffer: Buffer,
|
buffer: Buffer,
|
||||||
title: string,
|
title: string,
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue