fix(docling): use standalone script for model pre-download
The inline `python -c` one-liner had syntax errors: compound statements such as try/except/finally cannot be chained with `;` on a single line. Move the logic to scripts/preload_models.py for reliable execution.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
d725864cd6
commit
764613bd86
|
|
@ -11,18 +11,10 @@ COPY requirements.txt .
|
|||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY app/ ./app/
|
||||
COPY scripts/ ./scripts/
|
||||
|
||||
# 构建时预下载模型(需要实际转换才会触发 HuggingFace 模型下载)
|
||||
RUN python -c "\
|
||||
import tempfile, os; \
|
||||
pdf = b'%PDF-1.4\n1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj\n2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj\n3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R/Contents 4 0 R/Resources<</Font<</F1 5 0 R>>>>>>endobj\n4 0 obj<</Length 44>>stream\nBT /F1 12 Tf 100 700 Td (hello) Tj ET\nendstream\nendobj\n5 0 obj<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>endobj\nxref\n0 6\n0000000000 65535 f \n0000000009 00000 n \n0000000058 00000 n \n0000000115 00000 n \n0000000266 00000 n \n0000000360 00000 n \ntrailer<</Size 6/Root 1 0 R>>\nstartxref\n454\n%%EOF'; \
|
||||
f = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False); f.write(pdf); f.close(); \
|
||||
from docling.document_converter import DocumentConverter; \
|
||||
conv = DocumentConverter(); \
|
||||
try: conv.convert(f.name) \
|
||||
except: pass \
|
||||
finally: os.unlink(f.name); \
|
||||
print('Models pre-downloaded successfully')"
|
||||
# 构建时预下载模型(实际执行一次 PDF 转换触发 HuggingFace 模型下载)
|
||||
RUN python scripts/preload_models.py
|
||||
|
||||
EXPOSE 3007
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,41 @@
|
|||
"""Pre-download the Docling models into the image cache at build time.

The models are only fetched from HuggingFace when a document is actually
converted, so this script converts a tiny throw-away PDF once during the
image build. The conversion result itself is irrelevant; only the download
side effect matters.
"""

import os
import tempfile

# Smallest PDF we can hand to the converter: one page containing "hello".
MINIMAL_PDF = (
    b"%PDF-1.4\n"
    b"1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj\n"
    b"2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj\n"
    b"3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R/Contents 4 0 R"
    b"/Resources<</Font<</F1 5 0 R>>>>>>endobj\n"
    b"4 0 obj<</Length 44>>stream\n"
    b"BT /F1 12 Tf 100 700 Td (hello) Tj ET\n"
    b"endstream\nendobj\n"
    b"5 0 obj<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>endobj\n"
    b"xref\n0 6\n"
    b"0000000000 65535 f \n"
    b"0000000009 00000 n \n"
    b"0000000058 00000 n \n"
    b"0000000115 00000 n \n"
    b"0000000266 00000 n \n"
    b"0000000360 00000 n \n"
    b"trailer<</Size 6/Root 1 0 R>>\n"
    b"startxref\n454\n%%EOF"
)


def write_minimal_pdf(directory: str) -> str:
    """Write ``MINIMAL_PDF`` into *directory* and return the file's path.

    Args:
        directory: Existing directory in which to create the PDF.

    Returns:
        Path of the newly written ``.pdf`` file.
    """
    pdf_path = os.path.join(directory, "preload.pdf")
    with open(pdf_path, "wb") as fh:
        fh.write(MINIMAL_PDF)
    return pdf_path


def main() -> None:
    """Run one Docling conversion so its models end up in the local cache.

    A conversion failure is tolerated (the sample PDF is deliberately
    minimal) and only reported. A missing ``docling`` installation is NOT
    tolerated: the import error propagates so the image build fails instead
    of silently producing an image without cached models.
    """
    # Deliberately outside the try block below: an ImportError here means the
    # environment is broken, not that the minimal PDF was rejected.
    from docling.document_converter import DocumentConverter

    # The temporary directory removes the sample PDF on every exit path,
    # including a failed write.
    with tempfile.TemporaryDirectory() as workdir:
        pdf_path = write_minimal_pdf(workdir)
        try:
            DocumentConverter().convert(pdf_path)
        except Exception as e:
            print(f"Conversion failed (expected for minimal PDF): {e}")
            print("Models should still be cached")
        else:
            print("Models pre-downloaded successfully")


if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in New Issue