diff --git a/.env.example b/.env.example
index e7672f9..902e9f7 100644
--- a/.env.example
+++ b/.env.example
@@ -119,6 +119,8 @@ PAYMENT_SERVICE_PORT=3002
 KNOWLEDGE_SERVICE_PORT=3003
 CONVERSATION_SERVICE_PORT=3004
 EVOLUTION_SERVICE_PORT=3005
+FILE_SERVICE_PORT=3006
+DOCLING_SERVICE_PORT=3007
 
 # ===========================================
 # 服务间通信 URL
@@ -128,6 +130,10 @@ PAYMENT_SERVICE_URL=http://localhost:3002
 KNOWLEDGE_SERVICE_URL=http://localhost:3003
 CONVERSATION_SERVICE_URL=http://localhost:3004
 EVOLUTION_SERVICE_URL=http://localhost:3005
+# File service (used by conversation-service to download attachments)
+FILE_SERVICE_URL=http://localhost:3006
+# Docling document-parsing service (PDF/DOCX table-structure recognition + OCR)
+DOCLING_SERVICE_URL=http://localhost:3007
 
 # ===========================================
 # Kong API Gateway
diff --git a/docker-compose.yml b/docker-compose.yml
index b197166..2b057d2 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -250,6 +250,23 @@ services:
     networks:
       - iconsulting-network
 
+  docling-service:
+    build:
+      context: ./packages/services/docling-service
+      dockerfile: Dockerfile
+    container_name: iconsulting-docling
+    restart: unless-stopped
+    ports:
+      - "3007:3007"
+    healthcheck:
+      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:3007/health')"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 120s
+    networks:
+      - iconsulting-network
+
   knowledge-service:
     build:
       context: .
@@ -263,6 +280,8 @@ services:
         condition: service_healthy
       neo4j:
         condition: service_healthy
+      docling-service:
+        condition: service_started  # optional dependency: text extraction falls back to pdf-parse/mammoth, so do not gate startup on Docling health
     environment:
       NODE_ENV: production
       PORT: 3003
@@ -278,6 +297,7 @@ services:
       NEO4J_PASSWORD: ${NEO4J_PASSWORD:-neo4j123}
       OPENAI_API_KEY: ${OPENAI_API_KEY}
       OPENAI_BASE_URL: ${OPENAI_BASE_URL:-}
+      DOCLING_SERVICE_URL: http://docling-service:3007
     ports:
       - "3003:3003"
     healthcheck:
@@ -315,6 +335,7 @@ services:
       ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY}
       ANTHROPIC_BASE_URL: ${ANTHROPIC_BASE_URL:-https://api.anthropic.com}
       KNOWLEDGE_SERVICE_URL: http://knowledge-service:3003
+      FILE_SERVICE_URL: http://file-service:3006
       CORS_ORIGINS: https://iconsulting.szaiai.com,http://localhost:5173
       JWT_SECRET: ${JWT_SECRET:-your-jwt-secret-key}
     ports:
diff --git a/packages/services/docling-service/Dockerfile b/packages/services/docling-service/Dockerfile
new file mode 100644
index 0000000..8c56aa2
--- /dev/null
+++ b/packages/services/docling-service/Dockerfile
@@ -0,0 +1,23 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# System dependencies (shared libraries needed for OCR and image processing)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libgl1 libglib2.0-0 libsm6 libxext6 libxrender1 \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY app/ ./app/
+
+# Pre-download models at build time to avoid first-request latency. NOTE(review): confirm that constructing DocumentConverter() alone fetches the model weights in docling 2.14.
+RUN python -c "from docling.document_converter import DocumentConverter; DocumentConverter()"
+
+EXPOSE 3007
+
+HEALTHCHECK --interval=30s --timeout=10s --retries=3 --start-period=120s \
+    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:3007/health')" || exit 1
+
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "3007"]
diff --git a/packages/services/docling-service/app/__init__.py b/packages/services/docling-service/app/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packages/services/docling-service/app/main.py b/packages/services/docling-service/app/main.py
new file mode 100644
index 0000000..f36ec37
--- /dev/null
+++ b/packages/services/docling-service/app/main.py
@@ -0,0 +1,174 @@
+"""Docling Document Parsing Service — FastAPI
+
+Microservice wrapping IBM Docling for high-quality PDF/DOCX text extraction,
+including table-structure recognition (TableFormer) and OCR support.
+"""
+
+import logging
+import os
+import re
+import tempfile
+import threading
+
+from fastapi import FastAPI, File, HTTPException, UploadFile
+from fastapi.concurrency import run_in_threadpool
+
+app = FastAPI(title="Docling Service", version="0.1.0")
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("docling-service")
+
+# Content types returned verbatim, without going through Docling.
+_PLAIN_TEXT_TYPES = frozenset({"text/plain", "text/markdown"})
+
+# Content types converted by Docling, mapped to the temp-file suffix to use.
+_DOCLING_SUFFIXES = {
+    "application/pdf": ".pdf",
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
+}
+
+_CJK_CHAR_RE = re.compile(r"[\u4e00-\u9fff]")
+
+# Lazy initialisation: the converter is created on the first request.
+_converter = None
+_converter_init_lock = threading.Lock()
+# Conversions run on worker threads; this lock keeps them serialised, as
+# they were when the blocking call ran directly on the event loop.
+_convert_lock = threading.Lock()
+
+
+def get_converter():
+    """Return the shared DocumentConverter, creating it on first use."""
+    global _converter
+    if _converter is None:
+        with _converter_init_lock:
+            # Re-check under the lock so only one thread builds the converter.
+            if _converter is None:
+                from docling.document_converter import DocumentConverter
+
+                logger.info("Initializing DocumentConverter (loading models)...")
+                _converter = DocumentConverter()
+                logger.info("DocumentConverter ready")
+    return _converter
+
+
+def _convert_to_markdown(path: str):
+    """Convert the document at *path*; return ``(markdown, page_count)``.
+
+    Blocking: call it from a worker thread, never from the event loop.
+    ``page_count`` is ``None`` when the document exposes no ``pages``.
+    """
+    with _convert_lock:
+        result = get_converter().convert(path)
+    document = result.document
+    markdown = document.export_to_markdown()
+    pages = getattr(document, "pages", None)
+    page_count = len(pages) if pages is not None else None
+    return markdown, page_count
+
+
+@app.get("/health")
+def health():
+    """Liveness probe used by the container health checks."""
+    return {"status": "ok"}
+
+
+@app.post("/extract")
+async def extract_text(file: UploadFile = File(...)):
+    """Parse an uploaded file and return its text as structured Markdown.
+
+    Plain-text and Markdown uploads are decoded and returned as-is; PDF and
+    DOCX uploads are converted by Docling.
+
+    Returns:
+        A JSON object with ``text``, ``title`` and ``wordCount``, plus
+        ``pageCount`` for Docling conversions.
+
+    Raises:
+        HTTPException: 400 for a missing filename, an unsupported content
+            type or an empty document; 500 when the conversion fails.
+    """
+    if not file.filename:
+        raise HTTPException(status_code=400, detail="Missing filename")
+
+    # Ignore parameters and case, so "text/plain; charset=utf-8" still matches.
+    content_type = (file.content_type or "").split(";", 1)[0].strip().lower()
+
+    # Plain text needs no Docling pass.
+    if content_type in _PLAIN_TEXT_TYPES:
+        raw = (await file.read()).decode("utf-8", errors="replace")
+        return {
+            "text": raw.strip(),
+            "title": _derive_title(file.filename),
+            "wordCount": _count_words(raw),
+        }
+
+    suffix = _DOCLING_SUFFIXES.get(content_type)
+    if suffix is None:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Unsupported file type: {content_type}",
+        )
+
+    payload = await file.read()
+    if not payload:
+        raise HTTPException(status_code=400, detail="Empty file")
+
+    # Docling needs a filesystem path, so spool the upload to a temp file.
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
+    tmp_path = tmp.name
+    try:
+        with tmp:
+            tmp.write(payload)
+
+        # Run the blocking conversion off the event loop so that /health
+        # keeps answering while a large document is being processed.
+        md_text, page_count = await run_in_threadpool(
+            _convert_to_markdown, tmp_path
+        )
+
+        logger.info(
+            "Extracted %d chars from '%s' (%s pages)",
+            len(md_text),
+            file.filename,
+            "?" if page_count is None else page_count,
+        )
+
+        return {
+            "text": md_text.strip(),
+            "title": _derive_title(file.filename),
+            "pageCount": page_count,
+            "wordCount": _count_words(md_text),
+        }
+    except Exception as e:
+        logger.exception("Docling conversion failed for '%s'", file.filename)
+        raise HTTPException(
+            status_code=500,
+            detail=f"Extraction failed: {str(e)}",
+        ) from e
+    finally:
+        try:
+            os.unlink(tmp_path)
+        except OSError:
+            # Never let a cleanup failure replace the real outcome.
+            logger.warning("Could not remove temp file %s", tmp_path)
+
+
+def _derive_title(filename: str) -> str:
+    """Derive a title from *filename*.
+
+    Drops the extension and turns runs of hyphens/underscores into spaces.
+    """
+    name = os.path.splitext(filename)[0]
+    return re.sub(r"[-_]+", " ", name).strip()
+
+
+def _count_words(text: str) -> int:
+    """Count words in mixed Chinese/English text.
+
+    Every character in U+4E00–U+9FFF counts as one word; what remains
+    after removing those characters is split on whitespace.
+    """
+    chinese = len(_CJK_CHAR_RE.findall(text))
+    english = len(_CJK_CHAR_RE.sub("", text).split())
+    return chinese + english
diff --git a/packages/services/docling-service/requirements.txt b/packages/services/docling-service/requirements.txt
new file mode 100644
index 0000000..e5f97b0
--- /dev/null
+++ 
b/packages/services/docling-service/requirements.txt
@@ -0,0 +1,4 @@
+fastapi==0.115.0
+uvicorn[standard]==0.30.0
+python-multipart==0.0.9
+docling==2.14.0
diff --git a/packages/services/knowledge-service/src/application/services/text-extraction.service.ts b/packages/services/knowledge-service/src/application/services/text-extraction.service.ts
index 41e9821..a0e183b 100644
--- a/packages/services/knowledge-service/src/application/services/text-extraction.service.ts
+++ b/packages/services/knowledge-service/src/application/services/text-extraction.service.ts
@@ -1,9 +1,13 @@
 /**
  * Text Extraction Service
  * 从上传的文件中提取文本内容(PDF、Word、TXT、Markdown)
+ *
+ * Prefers the Docling microservice (high quality: table-structure recognition + OCR)
+ * and falls back to pdf-parse / mammoth (plain-text extraction) on failure.
  */
 
-import { Injectable, BadRequestException, Logger } from '@nestjs/common';
+import { Injectable, BadRequestException, Logger, Optional } from '@nestjs/common';
+import { ConfigService } from '@nestjs/config';
 import { PDFParse } from 'pdf-parse';
 import * as mammoth from 'mammoth';
 
@@ -26,6 +30,21 @@ const MAX_FILE_SIZE = 200 * 1024 * 1024; // 200MB
 @Injectable()
 export class TextExtractionService {
   private readonly logger = new Logger(TextExtractionService.name);
+  /** Explicit upper bound for a single Docling request. */
+  private static readonly DOCLING_TIMEOUT_MS = 5 * 60 * 1000;
+
+  private readonly doclingUrl: string | null;
+
+  constructor(@Optional() private readonly configService?: ConfigService) {
+    const rawUrl = this.configService?.get('DOCLING_SERVICE_URL');
+    // Drop trailing slashes so `${doclingUrl}/extract` never contains "//".
+    this.doclingUrl = rawUrl ? String(rawUrl).replace(/\/+$/, '') || null : null;
+    if (this.doclingUrl) {
+      this.logger.log(`Docling service configured: ${this.doclingUrl}`);
+    } else {
+      this.logger.warn('DOCLING_SERVICE_URL not set — using legacy pdf-parse/mammoth');
+    }
+  }
 
   validateFile(file: Express.Multer.File): void {
     if (!ALLOWED_MIME_TYPES.includes(file.mimetype)) {
@@ -48,6 +67,18 @@ export class TextExtractionService {
       `Extracting text from "${file.originalname}" (${file.mimetype}, ${(file.size / 1024).toFixed(0)}KB)`,
     );
 
+    // Prefer Docling for PDF and DOCX (table structure + OCR)
+    if (this.doclingUrl && file.mimetype !== 'text/plain' && file.mimetype !== 'text/markdown') {
+      try {
+        return await this.extractViaDocling(file);
+      } catch (err) {
+        this.logger.warn(
+          `Docling extraction failed for "${file.originalname}", falling back to legacy: ${err}`,
+        );
+      }
+    }
+
+    // Fall back to pdf-parse / mammoth / raw text
     switch (file.mimetype) {
       case 'application/pdf':
         return this.extractFromPdf(file.buffer, title);
@@ -61,6 +92,66 @@ export class TextExtractionService {
     }
   }
 
+  /**
+   * Extract text through the Docling microservice (high quality).
+   * HTTP POST multipart/form-data → JSON { text, title, pageCount, wordCount }
+   *
+   * Throws on timeout, a non-2xx status or a malformed body so that the
+   * caller can fall back to the legacy extractors.
+   */
+  private async extractViaDocling(file: Express.Multer.File): Promise<{
+    text: string;
+    title: string;
+    pageCount?: number;
+    wordCount: number;
+  }> {
+    const formData = new FormData();
+    formData.append(
+      'file',
+      new Blob([file.buffer], { type: file.mimetype }),
+      file.originalname,
+    );
+
+    const response = await fetch(`${this.doclingUrl}/extract`, {
+      method: 'POST',
+      body: formData,
+      // Bound the wait explicitly; a stalled request must not block the fallback.
+      signal: AbortSignal.timeout(TextExtractionService.DOCLING_TIMEOUT_MS),
+    });
+
+    if (!response.ok) {
+      const errText = await response.text();
+      throw new Error(`Docling HTTP ${response.status}: ${errText}`);
+    }
+
+    const result = (await response.json()) as {
+      text: string;
+      title: string;
+      pageCount?: number | null;
+      wordCount: number;
+    };
+
+    // Reject malformed payloads here rather than returning undefined fields.
+    if (typeof result?.text !== 'string' || typeof result.wordCount !== 'number') {
+      throw new Error('Docling returned an unexpected response shape');
+    }
+
+    this.logger.log(
+      `Docling extracted ${result.wordCount} words from "${file.originalname}"`,
+    );
+
+    return {
+      text: result.text,
+      title: result.title,
+      pageCount: result.pageCount ?? undefined,
+      wordCount: result.wordCount,
+    };
+  }
+
+  // ============================================================
+  // Legacy extractors (fallback)
+  // ============================================================
+
   private async extractFromPdf(
     buffer: Buffer,
     title: string,