"""Docling Document Parsing Service — FastAPI.

Microservice wrapping IBM Docling for high-quality PDF/DOCX text extraction,
including table structure recognition (TableFormer) and OCR support.
"""

import logging
import os
import re
import tempfile

from fastapi import FastAPI, File, HTTPException, UploadFile

app = FastAPI(title="Docling Service", version="0.1.0")
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("docling-service")

# Lazy initialization — the models are loaded on the first request.
_converter = None

# Media types that are returned verbatim, without going through Docling.
_TEXT_TYPES = frozenset({"text/plain", "text/markdown"})

# Media types handled by Docling, mapped to the temp-file suffix to use.
_DOCLING_SUFFIXES = {
    "application/pdf": ".pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
}

# One character in the CJK Unified Ideographs range U+4E00–U+9FFF.
_CJK_CHAR_RE = re.compile(r"[\u4e00-\u9fff]")


def get_converter():
    """Return the shared ``DocumentConverter``, creating it on first use.

    The ``docling`` import is deferred to the first call so that importing
    this module does not pull in the converter.
    """
    global _converter
    if _converter is None:
        from docling.document_converter import DocumentConverter

        logger.info("Initializing DocumentConverter (loading models)...")
        _converter = DocumentConverter()
        logger.info("DocumentConverter ready")
    return _converter


@app.get("/health")
def health():
    """Liveness probe; always reports ``{"status": "ok"}``."""
    return {"status": "ok"}


@app.post("/extract")
async def extract_text(file: UploadFile = File(...)):
    """Parse an uploaded file and return its text plus basic metadata.

    Plain-text and Markdown uploads are decoded as UTF-8 and returned as-is
    (stripped). PDF and DOCX uploads are written to a temporary file and
    converted to Markdown with Docling.

    Args:
        file: The uploaded file; its declared content type selects the path.

    Returns:
        A dict with ``text``, ``title`` and ``wordCount``; Docling results
        additionally carry ``pageCount`` (``None`` when unavailable).

    Raises:
        HTTPException: 400 when the filename is missing or the content type
            is unsupported; 500 when the Docling conversion fails.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="Missing filename")

    declared_type = file.content_type or ""
    # Media types are case-insensitive and may carry parameters such as
    # "; charset=utf-8" — compare on the bare, lower-cased type only.
    content_type = declared_type.split(";", 1)[0].strip().lower()

    # Plain text: return directly, no Docling needed.
    if content_type in _TEXT_TYPES:
        raw = (await file.read()).decode("utf-8", errors="replace")
        return {
            "text": raw.strip(),
            "title": _derive_title(file.filename),
            "wordCount": _count_words(raw),
        }

    suffix = _DOCLING_SUFFIXES.get(content_type)
    if suffix is None:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type: {declared_type}",
        )

    # Read the upload before creating the temp file so that a failed read
    # cannot leave a stray file behind.
    payload = await file.read()

    # Docling needs a file path — spill the upload to a temporary file.
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = tmp.name
            tmp.write(payload)
        return _convert_with_docling(tmp_path, file.filename)
    finally:
        # Runs even when the write above fails, so the file never leaks.
        if tmp_path is not None:
            os.unlink(tmp_path)


def _convert_with_docling(path: str, filename: str) -> dict:
    """Convert the file at *path* with Docling and build the response dict.

    Any conversion failure is logged and re-raised as a 500 ``HTTPException``.
    """
    # NOTE(review): convert() is called synchronously from an async endpoint,
    # so it presumably blocks the event loop for the whole conversion.
    # Offloading to a worker thread would need the converter's thread-safety
    # confirmed first.
    try:
        result = get_converter().convert(path)
        md_text = result.document.export_to_markdown()

        page_count = None
        if hasattr(result.document, "pages"):
            page_count = len(result.document.pages)

        logger.info(
            "Extracted %d chars from '%s' (%s pages)",
            len(md_text),
            filename,
            # Explicit None check so a zero-page document logs "0", not "?".
            "?" if page_count is None else page_count,
        )
        return {
            "text": md_text.strip(),
            "title": _derive_title(filename),
            "pageCount": page_count,
            "wordCount": _count_words(md_text),
        }
    except Exception as exc:
        logger.exception("Docling conversion failed for '%s'", filename)
        raise HTTPException(
            status_code=500,
            detail=f"Extraction failed: {exc}",
        ) from exc


def _derive_title(filename: str) -> str:
    """Derive a title from *filename*.

    Drops the extension and turns each run of ``-``/``_`` into one space.
    """
    name = os.path.splitext(filename)[0]
    return re.sub(r"[-_]+", " ", name).strip()


def _count_words(text: str) -> int:
    """Count words in mixed Chinese/English text.

    Every character in U+4E00–U+9FFF counts as one word; whatever remains is
    counted as whitespace-separated tokens.
    """
    # Substitute a space (not "") so words on either side of a CJK run stay
    # separate: "hello世界world" is 2 + 2, not 2 + 1.
    remainder, cjk_count = _CJK_CHAR_RE.subn(" ", text)
    return cjk_count + len(remainder.split())