fix: filter markdown tables, status text, residual formatting from antaf
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
49ae06ae45
commit
21998c0777
|
|
@ -40,11 +40,12 @@ class LLMProvider(LLMProviderBase):
|
|||
|
||||
@staticmethod
|
||||
def _clean_text(text):
|
||||
"""清理阿福返回文本中的脏数据、链接、markdown"""
|
||||
"""清理阿福返回文本中的脏数据、链接、markdown、表格"""
|
||||
import re
|
||||
# 去掉阿福内部状态文本
|
||||
junk = [
|
||||
"完成资料引用", "内容生成", "正在思考", "正在搜索",
|
||||
"开始获取资料", "找到资料",
|
||||
]
|
||||
for j in junk:
|
||||
text = text.replace(j, "")
|
||||
|
|
@ -52,12 +53,21 @@ class LLMProvider(LLMProviderBase):
|
|||
text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
|
||||
# 裸URL
|
||||
text = re.sub(r'https?://\S+', '', text)
|
||||
# Markdown加粗 **文字** → 文字
|
||||
text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
|
||||
# Markdown加粗 **文字** → 文字(包括空的 ****)
|
||||
text = re.sub(r'\*{2,}([^*]*)\*{2,}', r'\1', text)
|
||||
# Markdown斜体 *文字* → 文字
|
||||
text = re.sub(r'\*([^*]+)\*', r'\1', text)
|
||||
# 多余空格
|
||||
# 残留星号
|
||||
text = text.replace('*', '')
|
||||
# Markdown表格行 | xxx | xxx |
|
||||
text = re.sub(r'\|[^|]*\|[^|]*\|[^|\n]*\|?', '', text)
|
||||
# 表格对齐行 | :--- | :--- |
|
||||
text = re.sub(r'\|\s*:?-+:?\s*(\|\s*:?-+:?\s*)+\|?', '', text)
|
||||
# Markdown标题 ### 文字
|
||||
text = re.sub(r'#{1,6}\s*', '', text)
|
||||
# 多余空格和空行
|
||||
text = re.sub(r' +', ' ', text)
|
||||
text = re.sub(r'\n{2,}', '\n', text)
|
||||
return text.strip()
|
||||
|
||||
@staticmethod
|
||||
|
|
|
|||
Loading…
Reference in New Issue