iconsulting/packages/services/docling-service/app/main.py

119 lines
3.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Docling Document Parsing Service — FastAPI
IBM Docling 文档解析微服务,提供高质量的 PDF/DOCX 文本提取,
包括表格结构识别TableFormer和 OCR 支持。
"""
import logging
import os
import re
import tempfile
from fastapi import FastAPI, File, HTTPException, UploadFile
# FastAPI application object; the route decorators in this module register on it.
app = FastAPI(title="Docling Service", version="0.1.0")
# Configure root logging at INFO level and create this service's named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("docling-service")
# Lazy initialization — the converter (and its models) is created on the first
# call to get_converter() rather than at import time.
_converter = None
def get_converter():
    """Return the module-wide DocumentConverter, building it on first use.

    The docling import is deferred to the first call so that importing this
    module does not pull in the library; later calls reuse the cached
    instance stored in ``_converter``.
    """
    global _converter

    # Fast path: the converter was already built by an earlier call.
    if _converter is not None:
        return _converter

    from docling.document_converter import DocumentConverter

    logger.info("Initializing DocumentConverter (loading models)...")
    _converter = DocumentConverter()
    logger.info("DocumentConverter ready")
    return _converter
@app.get("/health")
def health():
return {"status": "ok"}
@app.post("/extract")
async def extract_text(file: UploadFile = File(...)):
"""接收文件,使用 Docling 解析后返回结构化 Markdown 文本"""
if not file.filename:
raise HTTPException(status_code=400, detail="Missing filename")
allowed_types = {
"application/pdf",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"text/plain",
"text/markdown",
}
content_type = file.content_type or ""
# 纯文本:直接返回,无需 Docling
if content_type in ("text/plain", "text/markdown"):
raw = (await file.read()).decode("utf-8", errors="replace")
return {
"text": raw.strip(),
"title": _derive_title(file.filename),
"wordCount": _count_words(raw),
}
if content_type not in allowed_types:
raise HTTPException(
status_code=400,
detail=f"Unsupported file type: {content_type}",
)
# Docling 需要文件路径 — 写入临时文件
suffix = ".pdf" if "pdf" in content_type else ".docx"
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
tmp.write(await file.read())
tmp_path = tmp.name
try:
converter = get_converter()
result = converter.convert(tmp_path)
md_text = result.document.export_to_markdown()
page_count = None
if hasattr(result.document, "pages"):
page_count = len(result.document.pages)
logger.info(
"Extracted %d chars from '%s' (%s pages)",
len(md_text),
file.filename,
page_count or "?",
)
return {
"text": md_text.strip(),
"title": _derive_title(file.filename),
"pageCount": page_count,
"wordCount": _count_words(md_text),
}
except Exception as e:
logger.exception("Docling conversion failed for '%s'", file.filename)
raise HTTPException(
status_code=500,
detail=f"Extraction failed: {str(e)}",
)
finally:
os.unlink(tmp_path)
def _derive_title(filename: str) -> str:
"""从文件名推导标题"""
name = os.path.splitext(filename)[0]
return re.sub(r"[-_]+", " ", name).strip()
def _count_words(text: str) -> int:
"""中英文混合字数统计"""
chinese = len(re.findall(r"[\u4e00-\u9fff]", text))
english = len(
[w for w in re.sub(r"[\u4e00-\u9fff]", "", text).split() if w]
)
return chinese + english