"""Docling Document Parsing Service — FastAPI

Microservice around IBM Docling that provides high-quality PDF/DOCX text
extraction, including table-structure recognition (TableFormer) and OCR
support.
"""
import logging
import os
import re
import tempfile

from fastapi import FastAPI, File, HTTPException, UploadFile

app = FastAPI(title="Docling Service", version="0.1.0")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("docling-service")

# Lazily initialized — the Docling models are loaded on the first request,
# not at import time, to keep service startup fast.
_converter = None
|
||
|
||
|
||
def get_converter():
    """Return the process-wide DocumentConverter, creating it on first use."""
    global _converter
    if _converter is not None:
        return _converter

    # Imported lazily so the service can start without Docling loaded.
    from docling.document_converter import DocumentConverter

    logger.info("Initializing DocumentConverter (loading models)...")
    _converter = DocumentConverter()
    logger.info("DocumentConverter ready")
    return _converter
|
||
|
||
|
||
@app.get("/health")
def health():
    """Liveness probe — reports the service as reachable."""
    return {"status": "ok"}
|
||
|
||
|
||
@app.post("/extract")
async def extract_text(file: UploadFile = File(...)):
    """Receive a file, parse it with Docling, and return structured Markdown.

    Plain-text and Markdown uploads are returned directly (no Docling pass).
    PDF/DOCX uploads are spooled to a temporary file, converted, and returned
    as Markdown together with basic metadata (title, page count, word count).

    Raises:
        HTTPException(400): missing filename or unsupported content type.
        HTTPException(500): Docling conversion failed.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="Missing filename")

    allowed_types = {
        "application/pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "text/plain",
        "text/markdown",
    }
    content_type = file.content_type or ""

    # Plain text: return as-is, no Docling needed.
    if content_type in ("text/plain", "text/markdown"):
        raw = (await file.read()).decode("utf-8", errors="replace")
        return {
            "text": raw.strip(),
            "title": _derive_title(file.filename),
            "wordCount": _count_words(raw),
        }

    if content_type not in allowed_types:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type: {content_type}",
        )

    # Docling needs a filesystem path — write the upload to a temp file.
    suffix = ".pdf" if "pdf" in content_type else ".docx"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(await file.read())
        tmp_path = tmp.name

    try:
        converter = get_converter()
        result = converter.convert(tmp_path)
        md_text = result.document.export_to_markdown()

        # Page count is only available for paginated formats (e.g. PDF).
        page_count = None
        if hasattr(result.document, "pages"):
            page_count = len(result.document.pages)

        logger.info(
            "Extracted %d chars from '%s' (%s pages)",
            len(md_text),
            file.filename,
            page_count or "?",
        )

        return {
            "text": md_text.strip(),
            "title": _derive_title(file.filename),
            "pageCount": page_count,
            "wordCount": _count_words(md_text),
        }
    except Exception as e:
        logger.exception("Docling conversion failed for '%s'", file.filename)
        # Fix: chain with `from e` so the 500 keeps its root cause in
        # tracebacks instead of reading as "raised during handling of ...".
        raise HTTPException(
            status_code=500,
            detail=f"Extraction failed: {str(e)}",
        ) from e
    finally:
        # tmp_path is always bound here: the `try` only starts after the
        # temp file has been written and closed.
        os.unlink(tmp_path)
|
||
|
||
|
||
def _derive_title(filename: str) -> str:
|
||
"""从文件名推导标题"""
|
||
name = os.path.splitext(filename)[0]
|
||
return re.sub(r"[-_]+", " ", name).strip()
|
||
|
||
|
||
def _count_words(text: str) -> int:
|
||
"""中英文混合字数统计"""
|
||
chinese = len(re.findall(r"[\u4e00-\u9fff]", text))
|
||
english = len(
|
||
[w for w in re.sub(r"[\u4e00-\u9fff]", "", text).split() if w]
|
||
)
|
||
return chinese + english
|