feat(knowledge): add Docling document parsing microservice

Add IBM Docling as a Python FastAPI microservice for high-quality document
parsing with table structure recognition (TableFormer ~94% accuracy) and
OCR support, replacing pdf-parse/mammoth as the primary text extractor.

Architecture:
- New docling-service (Python FastAPI, port 3007) in Docker network
- knowledge-service calls docling-service via HTTP POST multipart/form-data
- Graceful fallback: if Docling fails, falls back to pdf-parse/mammoth
- Text/Markdown files skip Docling (no benefit for plain text)

Changes:
- New: packages/services/docling-service/ (main.py, Dockerfile, requirements.txt)
- docker-compose.yml: add docling-service, wire DOCLING_SERVICE_URL to
  knowledge-service, add missing FILE_SERVICE_URL to conversation-service
- text-extraction.service.ts: inject ConfigService, add extractViaDocling()
  with automatic fallback to legacy extractors
- .env.example: add FILE_SERVICE_PORT/URL and DOCLING_SERVICE_PORT/URL

Inter-service communication map:
  conversation-service → file-service (FILE_SERVICE_URL, attachments)
  conversation-service → knowledge-service (KNOWLEDGE_SERVICE_URL, RAG)
  knowledge-service → docling-service (DOCLING_SERVICE_URL, document parsing)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-02-07 05:24:10 -08:00
parent 470ec9a64e
commit 57d21526a5
7 changed files with 244 additions and 1 deletions

View File

@ -119,6 +119,8 @@ PAYMENT_SERVICE_PORT=3002
KNOWLEDGE_SERVICE_PORT=3003 KNOWLEDGE_SERVICE_PORT=3003
CONVERSATION_SERVICE_PORT=3004 CONVERSATION_SERVICE_PORT=3004
EVOLUTION_SERVICE_PORT=3005 EVOLUTION_SERVICE_PORT=3005
FILE_SERVICE_PORT=3006
DOCLING_SERVICE_PORT=3007
# =========================================== # ===========================================
# 服务间通信 URL # 服务间通信 URL
@ -128,6 +130,10 @@ PAYMENT_SERVICE_URL=http://localhost:3002
KNOWLEDGE_SERVICE_URL=http://localhost:3003 KNOWLEDGE_SERVICE_URL=http://localhost:3003
CONVERSATION_SERVICE_URL=http://localhost:3004 CONVERSATION_SERVICE_URL=http://localhost:3004
EVOLUTION_SERVICE_URL=http://localhost:3005 EVOLUTION_SERVICE_URL=http://localhost:3005
# 文件服务(conversation-service 下载附件用)
FILE_SERVICE_URL=http://localhost:3006
# Docling 文档解析服务(PDF/DOCX 表格结构识别 + OCR)
DOCLING_SERVICE_URL=http://localhost:3007
# =========================================== # ===========================================
# Kong API Gateway # Kong API Gateway

View File

@ -250,6 +250,23 @@ services:
networks: networks:
- iconsulting-network - iconsulting-network
docling-service:
build:
context: ./packages/services/docling-service
dockerfile: Dockerfile
container_name: iconsulting-docling
restart: unless-stopped
ports:
- "3007:3007"
healthcheck:
test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:3007/health')"]
interval: 30s
timeout: 10s
retries: 3
start_period: 120s
networks:
- iconsulting-network
knowledge-service: knowledge-service:
build: build:
context: . context: .
@ -263,6 +280,8 @@ services:
condition: service_healthy condition: service_healthy
neo4j: neo4j:
condition: service_healthy condition: service_healthy
docling-service:
condition: service_healthy
environment: environment:
NODE_ENV: production NODE_ENV: production
PORT: 3003 PORT: 3003
@ -278,6 +297,7 @@ services:
NEO4J_PASSWORD: ${NEO4J_PASSWORD:-neo4j123} NEO4J_PASSWORD: ${NEO4J_PASSWORD:-neo4j123}
OPENAI_API_KEY: ${OPENAI_API_KEY} OPENAI_API_KEY: ${OPENAI_API_KEY}
OPENAI_BASE_URL: ${OPENAI_BASE_URL:-} OPENAI_BASE_URL: ${OPENAI_BASE_URL:-}
DOCLING_SERVICE_URL: http://docling-service:3007
ports: ports:
- "3003:3003" - "3003:3003"
healthcheck: healthcheck:
@ -315,6 +335,7 @@ services:
ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY} ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY}
ANTHROPIC_BASE_URL: ${ANTHROPIC_BASE_URL:-https://api.anthropic.com} ANTHROPIC_BASE_URL: ${ANTHROPIC_BASE_URL:-https://api.anthropic.com}
KNOWLEDGE_SERVICE_URL: http://knowledge-service:3003 KNOWLEDGE_SERVICE_URL: http://knowledge-service:3003
FILE_SERVICE_URL: http://file-service:3006
CORS_ORIGINS: https://iconsulting.szaiai.com,http://localhost:5173 CORS_ORIGINS: https://iconsulting.szaiai.com,http://localhost:5173
JWT_SECRET: ${JWT_SECRET:-your-jwt-secret-key} JWT_SECRET: ${JWT_SECRET:-your-jwt-secret-key}
ports: ports:

View File

@ -0,0 +1,23 @@
FROM python:3.11-slim
WORKDIR /app
# 系统依赖OCR、图像处理所需的共享库
RUN apt-get update && apt-get install -y --no-install-recommends \
libgl1 libglib2.0-0 libsm6 libxext6 libxrender1 \
&& rm -rf /var/lib/apt/lists/*
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY app/ ./app/
# 构建时预下载模型,避免首次请求延迟
RUN python -c "from docling.document_converter import DocumentConverter; DocumentConverter()"
EXPOSE 3007
HEALTHCHECK --interval=30s --timeout=10s --retries=3 --start-period=120s \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:3007/health')" || exit 1
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "3007"]

View File

@ -0,0 +1,118 @@
"""Docling Document Parsing Service — FastAPI
IBM Docling 文档解析微服务提供高质量的 PDF/DOCX 文本提取
包括表格结构识别TableFormer OCR 支持
"""
import logging
import os
import re
import tempfile
from fastapi import FastAPI, File, HTTPException, UploadFile
app = FastAPI(title="Docling Service", version="0.1.0")
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("docling-service")
# 延迟初始化 — 首次请求时加载模型
_converter = None
def get_converter():
global _converter
if _converter is None:
from docling.document_converter import DocumentConverter
logger.info("Initializing DocumentConverter (loading models)...")
_converter = DocumentConverter()
logger.info("DocumentConverter ready")
return _converter
@app.get("/health")
def health():
return {"status": "ok"}
@app.post("/extract")
async def extract_text(file: UploadFile = File(...)):
"""接收文件,使用 Docling 解析后返回结构化 Markdown 文本"""
if not file.filename:
raise HTTPException(status_code=400, detail="Missing filename")
allowed_types = {
"application/pdf",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"text/plain",
"text/markdown",
}
content_type = file.content_type or ""
# 纯文本:直接返回,无需 Docling
if content_type in ("text/plain", "text/markdown"):
raw = (await file.read()).decode("utf-8", errors="replace")
return {
"text": raw.strip(),
"title": _derive_title(file.filename),
"wordCount": _count_words(raw),
}
if content_type not in allowed_types:
raise HTTPException(
status_code=400,
detail=f"Unsupported file type: {content_type}",
)
# Docling 需要文件路径 — 写入临时文件
suffix = ".pdf" if "pdf" in content_type else ".docx"
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
tmp.write(await file.read())
tmp_path = tmp.name
try:
converter = get_converter()
result = converter.convert(tmp_path)
md_text = result.document.export_to_markdown()
page_count = None
if hasattr(result.document, "pages"):
page_count = len(result.document.pages)
logger.info(
"Extracted %d chars from '%s' (%s pages)",
len(md_text),
file.filename,
page_count or "?",
)
return {
"text": md_text.strip(),
"title": _derive_title(file.filename),
"pageCount": page_count,
"wordCount": _count_words(md_text),
}
except Exception as e:
logger.exception("Docling conversion failed for '%s'", file.filename)
raise HTTPException(
status_code=500,
detail=f"Extraction failed: {str(e)}",
)
finally:
os.unlink(tmp_path)
def _derive_title(filename: str) -> str:
"""从文件名推导标题"""
name = os.path.splitext(filename)[0]
return re.sub(r"[-_]+", " ", name).strip()
def _count_words(text: str) -> int:
"""中英文混合字数统计"""
chinese = len(re.findall(r"[\u4e00-\u9fff]", text))
english = len(
[w for w in re.sub(r"[\u4e00-\u9fff]", "", text).split() if w]
)
return chinese + english

View File

@ -0,0 +1,4 @@
fastapi==0.115.0
uvicorn[standard]==0.30.0
python-multipart==0.0.9
docling==2.14.0

View File

@ -1,9 +1,13 @@
/** /**
* Text Extraction Service * Text Extraction Service
* PDFWordTXTMarkdown * PDFWordTXTMarkdown
*
* 使 Docling + OCR
* 退 pdf-parse / mammoth
*/ */
import { Injectable, BadRequestException, Logger } from '@nestjs/common'; import { Injectable, BadRequestException, Logger, Optional } from '@nestjs/common';
import { ConfigService } from '@nestjs/config';
import { PDFParse } from 'pdf-parse'; import { PDFParse } from 'pdf-parse';
import * as mammoth from 'mammoth'; import * as mammoth from 'mammoth';
@ -26,6 +30,16 @@ const MAX_FILE_SIZE = 200 * 1024 * 1024; // 200MB
@Injectable() @Injectable()
export class TextExtractionService { export class TextExtractionService {
private readonly logger = new Logger(TextExtractionService.name); private readonly logger = new Logger(TextExtractionService.name);
private readonly doclingUrl: string | null;
constructor(@Optional() private readonly configService?: ConfigService) {
this.doclingUrl = this.configService?.get<string>('DOCLING_SERVICE_URL') || null;
if (this.doclingUrl) {
this.logger.log(`Docling service configured: ${this.doclingUrl}`);
} else {
this.logger.warn('DOCLING_SERVICE_URL not set — using legacy pdf-parse/mammoth');
}
}
validateFile(file: Express.Multer.File): void { validateFile(file: Express.Multer.File): void {
if (!ALLOWED_MIME_TYPES.includes(file.mimetype)) { if (!ALLOWED_MIME_TYPES.includes(file.mimetype)) {
@ -48,6 +62,18 @@ export class TextExtractionService {
`Extracting text from "${file.originalname}" (${file.mimetype}, ${(file.size / 1024).toFixed(0)}KB)`, `Extracting text from "${file.originalname}" (${file.mimetype}, ${(file.size / 1024).toFixed(0)}KB)`,
); );
// 优先使用 DoclingPDF 和 DOCX — 表格结构 + OCR
if (this.doclingUrl && file.mimetype !== 'text/plain' && file.mimetype !== 'text/markdown') {
try {
return await this.extractViaDocling(file);
} catch (err) {
this.logger.warn(
`Docling extraction failed for "${file.originalname}", falling back to legacy: ${err}`,
);
}
}
// 回退到 pdf-parse / mammoth / 直接文本
switch (file.mimetype) { switch (file.mimetype) {
case 'application/pdf': case 'application/pdf':
return this.extractFromPdf(file.buffer, title); return this.extractFromPdf(file.buffer, title);
@ -61,6 +87,51 @@ export class TextExtractionService {
} }
} }
/**
* Docling
* HTTP POST multipart/form-data JSON { text, title, pageCount, wordCount }
*/
private async extractViaDocling(file: Express.Multer.File): Promise<ExtractedContent> {
const formData = new FormData();
formData.append(
'file',
new Blob([file.buffer], { type: file.mimetype }),
file.originalname,
);
const response = await fetch(`${this.doclingUrl}/extract`, {
method: 'POST',
body: formData,
});
if (!response.ok) {
const errText = await response.text();
throw new Error(`Docling HTTP ${response.status}: ${errText}`);
}
const result = (await response.json()) as {
text: string;
title: string;
pageCount?: number;
wordCount: number;
};
this.logger.log(
`Docling extracted ${result.wordCount} words from "${file.originalname}"`,
);
return {
text: result.text,
title: result.title,
pageCount: result.pageCount ?? undefined,
wordCount: result.wordCount,
};
}
// ============================================================
// Legacy extractors (fallback)
// ============================================================
private async extractFromPdf( private async extractFromPdf(
buffer: Buffer, buffer: Buffer,
title: string, title: string,