feat(knowledge): add Docling document parsing microservice
Add IBM Docling as a Python FastAPI microservice for high-quality document parsing with table structure recognition (TableFormer ~94% accuracy) and OCR support, replacing pdf-parse/mammoth as the primary text extractor. Architecture: - New docling-service (Python FastAPI, port 3007) in Docker network - knowledge-service calls docling-service via HTTP POST multipart/form-data - Graceful fallback: if Docling fails, falls back to pdf-parse/mammoth - Text/Markdown files skip Docling (no benefit for plain text) Changes: - New: packages/services/docling-service/ (main.py, Dockerfile, requirements.txt) - docker-compose.yml: add docling-service, wire DOCLING_SERVICE_URL to knowledge-service, add missing FILE_SERVICE_URL to conversation-service - text-extraction.service.ts: inject ConfigService, add extractViaDocling() with automatic fallback to legacy extractors - .env.example: add FILE_SERVICE_PORT/URL and DOCLING_SERVICE_PORT/URL Inter-service communication map: conversation-service → file-service (FILE_SERVICE_URL, attachments) conversation-service → knowledge-service (KNOWLEDGE_SERVICE_URL, RAG) knowledge-service → docling-service (DOCLING_SERVICE_URL, document parsing) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
470ec9a64e
commit
57d21526a5
|
|
@ -119,6 +119,8 @@ PAYMENT_SERVICE_PORT=3002
|
|||
KNOWLEDGE_SERVICE_PORT=3003
|
||||
CONVERSATION_SERVICE_PORT=3004
|
||||
EVOLUTION_SERVICE_PORT=3005
|
||||
FILE_SERVICE_PORT=3006
|
||||
DOCLING_SERVICE_PORT=3007
|
||||
|
||||
# ===========================================
|
||||
# 服务间通信 URL
|
||||
|
|
@ -128,6 +130,10 @@ PAYMENT_SERVICE_URL=http://localhost:3002
|
|||
KNOWLEDGE_SERVICE_URL=http://localhost:3003
|
||||
CONVERSATION_SERVICE_URL=http://localhost:3004
|
||||
EVOLUTION_SERVICE_URL=http://localhost:3005
|
||||
# 文件服务(conversation-service 下载附件用)
|
||||
FILE_SERVICE_URL=http://localhost:3006
|
||||
# Docling 文档解析服务(PDF/DOCX 表格结构识别 + OCR)
|
||||
DOCLING_SERVICE_URL=http://localhost:3007
|
||||
|
||||
# ===========================================
|
||||
# Kong API Gateway
|
||||
|
|
|
|||
|
|
@ -250,6 +250,23 @@ services:
|
|||
networks:
|
||||
- iconsulting-network
|
||||
|
||||
docling-service:
|
||||
build:
|
||||
context: ./packages/services/docling-service
|
||||
dockerfile: Dockerfile
|
||||
container_name: iconsulting-docling
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "3007:3007"
|
||||
healthcheck:
|
||||
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:3007/health')"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 120s
|
||||
networks:
|
||||
- iconsulting-network
|
||||
|
||||
knowledge-service:
|
||||
build:
|
||||
context: .
|
||||
|
|
@ -263,6 +280,8 @@ services:
|
|||
condition: service_healthy
|
||||
neo4j:
|
||||
condition: service_healthy
|
||||
docling-service:
|
||||
condition: service_healthy
|
||||
environment:
|
||||
NODE_ENV: production
|
||||
PORT: 3003
|
||||
|
|
@ -278,6 +297,7 @@ services:
|
|||
NEO4J_PASSWORD: ${NEO4J_PASSWORD:-neo4j123}
|
||||
OPENAI_API_KEY: ${OPENAI_API_KEY}
|
||||
OPENAI_BASE_URL: ${OPENAI_BASE_URL:-}
|
||||
DOCLING_SERVICE_URL: http://docling-service:3007
|
||||
ports:
|
||||
- "3003:3003"
|
||||
healthcheck:
|
||||
|
|
@ -315,6 +335,7 @@ services:
|
|||
ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY}
|
||||
ANTHROPIC_BASE_URL: ${ANTHROPIC_BASE_URL:-https://api.anthropic.com}
|
||||
KNOWLEDGE_SERVICE_URL: http://knowledge-service:3003
|
||||
FILE_SERVICE_URL: http://file-service:3006
|
||||
CORS_ORIGINS: https://iconsulting.szaiai.com,http://localhost:5173
|
||||
JWT_SECRET: ${JWT_SECRET:-your-jwt-secret-key}
|
||||
ports:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,23 @@
|
|||
FROM python:3.11-slim

WORKDIR /app

# System packages: shared libraries required by OCR and image processing.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1 libglib2.0-0 libsm6 libxext6 libxrender1 \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Pre-download Docling models at build time to avoid first-request latency.
# This runs BEFORE copying the app code so that routine code changes do not
# invalidate this slow, large layer and force a model re-download.
RUN python -c "from docling.document_converter import DocumentConverter; DocumentConverter()"

COPY app/ ./app/

EXPOSE 3007

# Long start-period: cold start can take a while if models must initialize.
HEALTHCHECK --interval=30s --timeout=10s --retries=3 --start-period=120s \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:3007/health')" || exit 1

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "3007"]
|
||||
|
|
@ -0,0 +1,118 @@
|
|||
"""Docling Document Parsing Service — FastAPI
|
||||
|
||||
IBM Docling 文档解析微服务,提供高质量的 PDF/DOCX 文本提取,
|
||||
包括表格结构识别(TableFormer)和 OCR 支持。
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
|
||||
from fastapi import FastAPI, File, HTTPException, UploadFile
|
||||
|
||||
app = FastAPI(title="Docling Service", version="0.1.0")
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger("docling-service")
|
||||
|
||||
# 延迟初始化 — 首次请求时加载模型
|
||||
_converter = None
|
||||
|
||||
|
||||
def get_converter():
    """Return the process-wide DocumentConverter, creating it on first use.

    Import and construction are deferred so the service starts quickly;
    model loading only happens when the first document arrives.
    """
    global _converter
    if _converter is not None:
        return _converter

    from docling.document_converter import DocumentConverter

    logger.info("Initializing DocumentConverter (loading models)...")
    _converter = DocumentConverter()
    logger.info("DocumentConverter ready")
    return _converter
@app.get("/health")
|
||||
def health():
|
||||
return {"status": "ok"}
|
||||
|
||||
|
||||
@app.post("/extract")
|
||||
async def extract_text(file: UploadFile = File(...)):
|
||||
"""接收文件,使用 Docling 解析后返回结构化 Markdown 文本"""
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="Missing filename")
|
||||
|
||||
allowed_types = {
|
||||
"application/pdf",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"text/plain",
|
||||
"text/markdown",
|
||||
}
|
||||
content_type = file.content_type or ""
|
||||
|
||||
# 纯文本:直接返回,无需 Docling
|
||||
if content_type in ("text/plain", "text/markdown"):
|
||||
raw = (await file.read()).decode("utf-8", errors="replace")
|
||||
return {
|
||||
"text": raw.strip(),
|
||||
"title": _derive_title(file.filename),
|
||||
"wordCount": _count_words(raw),
|
||||
}
|
||||
|
||||
if content_type not in allowed_types:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Unsupported file type: {content_type}",
|
||||
)
|
||||
|
||||
# Docling 需要文件路径 — 写入临时文件
|
||||
suffix = ".pdf" if "pdf" in content_type else ".docx"
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
||||
tmp.write(await file.read())
|
||||
tmp_path = tmp.name
|
||||
|
||||
try:
|
||||
converter = get_converter()
|
||||
result = converter.convert(tmp_path)
|
||||
md_text = result.document.export_to_markdown()
|
||||
|
||||
page_count = None
|
||||
if hasattr(result.document, "pages"):
|
||||
page_count = len(result.document.pages)
|
||||
|
||||
logger.info(
|
||||
"Extracted %d chars from '%s' (%s pages)",
|
||||
len(md_text),
|
||||
file.filename,
|
||||
page_count or "?",
|
||||
)
|
||||
|
||||
return {
|
||||
"text": md_text.strip(),
|
||||
"title": _derive_title(file.filename),
|
||||
"pageCount": page_count,
|
||||
"wordCount": _count_words(md_text),
|
||||
}
|
||||
except Exception as e:
|
||||
logger.exception("Docling conversion failed for '%s'", file.filename)
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=f"Extraction failed: {str(e)}",
|
||||
)
|
||||
finally:
|
||||
os.unlink(tmp_path)
|
||||
|
||||
|
||||
def _derive_title(filename: str) -> str:
|
||||
"""从文件名推导标题"""
|
||||
name = os.path.splitext(filename)[0]
|
||||
return re.sub(r"[-_]+", " ", name).strip()
|
||||
|
||||
|
||||
def _count_words(text: str) -> int:
|
||||
"""中英文混合字数统计"""
|
||||
chinese = len(re.findall(r"[\u4e00-\u9fff]", text))
|
||||
english = len(
|
||||
[w for w in re.sub(r"[\u4e00-\u9fff]", "", text).split() if w]
|
||||
)
|
||||
return chinese + english
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
fastapi==0.115.0
|
||||
uvicorn[standard]==0.30.0
|
||||
python-multipart==0.0.9
|
||||
docling==2.14.0
|
||||
|
|
@ -1,9 +1,13 @@
|
|||
/**
|
||||
* Text Extraction Service
|
||||
* 从上传的文件中提取文本内容(PDF、Word、TXT、Markdown)
|
||||
*
|
||||
* 优先使用 Docling 微服务(高质量:表格结构识别 + OCR),
|
||||
* 失败时回退到 pdf-parse / mammoth(纯文本提取)。
|
||||
*/
|
||||
|
||||
import { Injectable, BadRequestException, Logger } from '@nestjs/common';
|
||||
import { Injectable, BadRequestException, Logger, Optional } from '@nestjs/common';
|
||||
import { ConfigService } from '@nestjs/config';
|
||||
import { PDFParse } from 'pdf-parse';
|
||||
import * as mammoth from 'mammoth';
|
||||
|
||||
|
|
@ -26,6 +30,16 @@ const MAX_FILE_SIZE = 200 * 1024 * 1024; // 200MB
|
|||
@Injectable()
|
||||
export class TextExtractionService {
|
||||
  private readonly logger = new Logger(TextExtractionService.name);
  // Base URL of the Docling microservice; null disables Docling entirely
  // and every extraction uses the legacy pdf-parse/mammoth path.
  private readonly doclingUrl: string | null;

  // ConfigService is @Optional() so the service still constructs (legacy
  // extractors only) in contexts where Nest's ConfigModule is not wired in.
  constructor(@Optional() private readonly configService?: ConfigService) {
    this.doclingUrl = this.configService?.get<string>('DOCLING_SERVICE_URL') || null;
    if (this.doclingUrl) {
      this.logger.log(`Docling service configured: ${this.doclingUrl}`);
    } else {
      this.logger.warn('DOCLING_SERVICE_URL not set — using legacy pdf-parse/mammoth');
    }
  }
|
||||
|
||||
validateFile(file: Express.Multer.File): void {
|
||||
if (!ALLOWED_MIME_TYPES.includes(file.mimetype)) {
|
||||
|
|
@ -48,6 +62,18 @@ export class TextExtractionService {
|
|||
`Extracting text from "${file.originalname}" (${file.mimetype}, ${(file.size / 1024).toFixed(0)}KB)`,
|
||||
);
|
||||
|
||||
// 优先使用 Docling(PDF 和 DOCX — 表格结构 + OCR)
|
||||
if (this.doclingUrl && file.mimetype !== 'text/plain' && file.mimetype !== 'text/markdown') {
|
||||
try {
|
||||
return await this.extractViaDocling(file);
|
||||
} catch (err) {
|
||||
this.logger.warn(
|
||||
`Docling extraction failed for "${file.originalname}", falling back to legacy: ${err}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// 回退到 pdf-parse / mammoth / 直接文本
|
||||
switch (file.mimetype) {
|
||||
case 'application/pdf':
|
||||
return this.extractFromPdf(file.buffer, title);
|
||||
|
|
@ -61,6 +87,51 @@ export class TextExtractionService {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 通过 Docling 微服务提取文本(高质量)
|
||||
* HTTP POST multipart/form-data → JSON { text, title, pageCount, wordCount }
|
||||
*/
|
||||
private async extractViaDocling(file: Express.Multer.File): Promise<ExtractedContent> {
|
||||
const formData = new FormData();
|
||||
formData.append(
|
||||
'file',
|
||||
new Blob([file.buffer], { type: file.mimetype }),
|
||||
file.originalname,
|
||||
);
|
||||
|
||||
const response = await fetch(`${this.doclingUrl}/extract`, {
|
||||
method: 'POST',
|
||||
body: formData,
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
const errText = await response.text();
|
||||
throw new Error(`Docling HTTP ${response.status}: ${errText}`);
|
||||
}
|
||||
|
||||
const result = (await response.json()) as {
|
||||
text: string;
|
||||
title: string;
|
||||
pageCount?: number;
|
||||
wordCount: number;
|
||||
};
|
||||
|
||||
this.logger.log(
|
||||
`Docling extracted ${result.wordCount} words from "${file.originalname}"`,
|
||||
);
|
||||
|
||||
return {
|
||||
text: result.text,
|
||||
title: result.title,
|
||||
pageCount: result.pageCount ?? undefined,
|
||||
wordCount: result.wordCount,
|
||||
};
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// Legacy extractors (fallback)
|
||||
// ============================================================
|
||||
|
||||
private async extractFromPdf(
|
||||
buffer: Buffer,
|
||||
title: string,
|
||||
|
|
|
|||
Loading…
Reference in New Issue