From 764613bd86f54d64a70532ce68dd57707cce7c97 Mon Sep 17 00:00:00 2001 From: hailin Date: Sat, 7 Feb 2026 07:16:20 -0800 Subject: [PATCH] fix(docling): use standalone script for model pre-download Inline Python one-liner had syntax errors (try/except/finally can't be single-line). Move to scripts/preload_models.py for reliable execution. Co-Authored-By: Claude Opus 4.6 --- packages/services/docling-service/Dockerfile | 14 ++----- .../docling-service/scripts/preload_models.py | 41 +++++++++++++++++++ 2 files changed, 44 insertions(+), 11 deletions(-) create mode 100644 packages/services/docling-service/scripts/preload_models.py diff --git a/packages/services/docling-service/Dockerfile b/packages/services/docling-service/Dockerfile index 70d4e9d..f556e69 100644 --- a/packages/services/docling-service/Dockerfile +++ b/packages/services/docling-service/Dockerfile @@ -11,18 +11,10 @@ COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt COPY app/ ./app/ +COPY scripts/ ./scripts/ -# 构建时预下载模型(需要实际转换才会触发 HuggingFace 模型下载) -RUN python -c "\ -import tempfile, os; \ -pdf = b'%PDF-1.4\n1 0 obj<>endobj\n2 0 obj<>endobj\n3 0 obj<>>>>>endobj\n4 0 obj<>stream\nBT /F1 12 Tf 100 700 Td (hello) Tj ET\nendstream\nendobj\n5 0 obj<>endobj\nxref\n0 6\n0000000000 65535 f \n0000000009 00000 n \n0000000058 00000 n \n0000000115 00000 n \n0000000266 00000 n \n0000000360 00000 n \ntrailer<>\nstartxref\n454\n%%EOF'; \ -f = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False); f.write(pdf); f.close(); \ -from docling.document_converter import DocumentConverter; \ -conv = DocumentConverter(); \ -try: conv.convert(f.name) \ -except: pass \ -finally: os.unlink(f.name); \ -print('Models pre-downloaded successfully')" +# 构建时预下载模型(实际执行一次 PDF 转换触发 HuggingFace 模型下载) +RUN python scripts/preload_models.py EXPOSE 3007 diff --git a/packages/services/docling-service/scripts/preload_models.py b/packages/services/docling-service/scripts/preload_models.py new file mode 100644 index 0000000..c33e9ed --- /dev/null +++ b/packages/services/docling-service/scripts/preload_models.py @@ -0,0 +1,41 @@ +"""构建时预下载 Docling 模型到镜像缓存""" +import tempfile +import os + +# 创建最小 PDF 文件 +pdf = ( + b"%PDF-1.4\n" + b"1 0 obj<>endobj\n" + b"2 0 obj<>endobj\n" + b"3 0 obj<>>>>>endobj\n" + b"4 0 obj<>stream\n" + b"BT /F1 12 Tf 100 700 Td (hello) Tj ET\n" + b"endstream\nendobj\n" + b"5 0 obj<>endobj\n" + b"xref\n0 6\n" + b"0000000000 65535 f \n" + b"0000000009 00000 n \n" + b"0000000058 00000 n \n" + b"0000000115 00000 n \n" + b"0000000266 00000 n \n" + b"0000000360 00000 n \n" + b"trailer<>\n" + b"startxref\n454\n%%EOF" +) + +tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) +tmp.write(pdf) +tmp.close() + +try: + from docling.document_converter import DocumentConverter + + conv = DocumentConverter() + conv.convert(tmp.name) + print("Models pre-downloaded successfully") +except Exception as e: + print(f"Conversion failed (expected for minimal PDF): {e}") + print("Models should still be cached") +finally: + os.unlink(tmp.name)