"""Docling Document Parsing Service — FastAPI

Microservice around IBM Docling that provides high-quality PDF/DOCX text
extraction, including table-structure recognition (TableFormer) and OCR
support.
"""
import logging
import os
import re
import tempfile

from fastapi import FastAPI, File, HTTPException, UploadFile

app = FastAPI(title="Docling Service", version="0.1.0")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("docling-service")

# Lazily initialized — the Docling models are loaded on the first request,
# not at import time, to keep service startup fast.
_converter = None
|
||
|
||
|
||
def get_converter():
    """Return the process-wide DocumentConverter, creating it on first use."""
    global _converter
    if _converter is not None:
        return _converter

    # Imported lazily so the service can start without Docling loaded.
    from docling.document_converter import DocumentConverter

    logger.info("Initializing DocumentConverter (loading models)...")
    _converter = DocumentConverter()
    logger.info("DocumentConverter ready")
    return _converter
|
||
|
||
|
||
@app.get("/health")
def health():
    """Liveness probe — reports the service as reachable."""
    return {"status": "ok"}
|
||
|
||
|
||
@app.post("/extract")
async def extract_text(file: UploadFile = File(...)):
    """Receive a file, parse it with Docling, and return structured Markdown.

    Plain-text and Markdown uploads are returned directly (no Docling pass).
    PDF/DOCX uploads are spooled to a temporary file, converted, and returned
    as Markdown together with basic metadata (title, page count, word count).

    Raises:
        HTTPException(400): missing filename or unsupported content type.
        HTTPException(500): Docling conversion failed.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="Missing filename")

    allowed_types = {
        "application/pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "text/plain",
        "text/markdown",
    }
    content_type = file.content_type or ""

    # Plain text: return as-is, no Docling needed.
    if content_type in ("text/plain", "text/markdown"):
        raw = (await file.read()).decode("utf-8", errors="replace")
        return {
            "text": raw.strip(),
            "title": _derive_title(file.filename),
            "wordCount": _count_words(raw),
        }

    if content_type not in allowed_types:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type: {content_type}",
        )

    # Docling needs a filesystem path — write the upload to a temp file.
    suffix = ".pdf" if "pdf" in content_type else ".docx"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(await file.read())
        tmp_path = tmp.name

    try:
        converter = get_converter()
        result = converter.convert(tmp_path)
        md_text = result.document.export_to_markdown()

        # Page count is only available for paginated formats (e.g. PDF).
        page_count = None
        if hasattr(result.document, "pages"):
            page_count = len(result.document.pages)

        logger.info(
            "Extracted %d chars from '%s' (%s pages)",
            len(md_text),
            file.filename,
            page_count or "?",
        )

        return {
            "text": md_text.strip(),
            "title": _derive_title(file.filename),
            "pageCount": page_count,
            "wordCount": _count_words(md_text),
        }
    except Exception as e:
        logger.exception("Docling conversion failed for '%s'", file.filename)
        # Fix: chain with `from e` so the 500 keeps its root cause in
        # tracebacks instead of reading as "raised during handling of ...".
        raise HTTPException(
            status_code=500,
            detail=f"Extraction failed: {str(e)}",
        ) from e
    finally:
        # tmp_path is always bound here: the `try` only starts after the
        # temp file has been written and closed.
        os.unlink(tmp_path)
|
||
|
||
|
||
def _derive_title(filename: str) -> str:
|
||
"""从文件名推导标题"""
|
||
name = os.path.splitext(filename)[0]
|
||
return re.sub(r"[-_]+", " ", name).strip()
|
||
|
||
|
||
def _count_words(text: str) -> int:
|
||
"""中英文混合字数统计"""
|
||
chinese = len(re.findall(r"[\u4e00-\u9fff]", text))
|
||
english = len(
|
||
[w for w in re.sub(r"[\u4e00-\u9fff]", "", text).split() if w]
|
||
)
|
||
return chinese + english
|