129 lines
9.4 KiB
Python
129 lines
9.4 KiB
Python
import re
|
|
|
|
|
|
def clean_content(content: str):
|
|
if content is None:
|
|
raise ValueError("content is None.")
|
|
|
|
content = content.split('</think>')[-1].strip('\n').strip()
|
|
|
|
if content.startswith('\"') and content.endswith('\"'):
|
|
content = content[1:-1]
|
|
|
|
if content.startswith("```\n") and content.endswith("\n```"):
|
|
content = content[4:-4]
|
|
|
|
return content
|
|
|
|
|
|
def clean_code(code: str, lang: str, length_threshold: int = 30) -> str:
|
|
cleaned_code = code.strip('\ufeff').strip()
|
|
if not cleaned_code:
|
|
return ''
|
|
|
|
def clean_empty_lines(text: str) -> str:
|
|
return re.sub(r'\n\s*\n', '\n', text).strip()
|
|
|
|
# 各语言函数/类定义检测正则表达式
|
|
function_patterns = {
|
|
"java": r"(?m)^(?!\s*(import|package)\b).*\b(public\s+class|class\s+\w+|void\s+main|new\s+\w+\(|@Override)\b",
|
|
"python": r"(?m)^(?!\s*(import|from\s+\S+\s+import)\b).*\b(def\s+\w+|class\s+\w+|=\s*\S+|if\s+[:\w]|print\s+)",
|
|
"javascript": r"(?m)^(?!\s*(import|require\(|export\s)).*\b(function\s+\w+|const\s+\w+|=>|\(\)\s*=>|console\.log)",
|
|
"php": r"(?m)^(?!\s*(include|require|use)\b).*\b(function\s+\w+|echo\s+\S+|class\s+\w+)",
|
|
"ruby": r"(?m)^(?!\s*(require|load)\b).*\b(class\s+\w+|def\s+\w+|puts\s+\S+)",
|
|
"go": r"(?m)^(?!\s*import\b).*\bfunc\s+main\s*\(|type\s+\w+\s+struct",
|
|
"c#": r"(?m)^(?!\s*using\b).*\b(class\s+\w+|void\s+Main\s*\()",
|
|
"cplusplus": r"(?m)^(?!#include\b).*\b(int\s+main\s*\(|class\s+\w+|void\s+\w+\s*\(.*\)\s*{)",
|
|
"c": r"(?m)^(?!#include\b).*\b(int\s+main\s*\(|void\s+\w+\s*\(.*\)\s*{)",
|
|
"rust": r"(?m)^(?!\s*use\b).*\b(fn\s+main\s*\(|struct\s+\w+|impl\s+\w+)",
|
|
"typescript": r"(?m)^(?!\s*(import|require\(|export\s)).*\b(interface\s+\w+|class\s+\w+|function\s+\w+)",
|
|
"perl": r"(?m)^(?!\s*(use|require)\b).*\b(sub\s+\w+|my\s+\$\w+|print\s+\S+)",
|
|
"shell": r"(?m)^(?!\s*(source|\.)\s).*\b(function\s+\w+|if\s+\[|\$\(|echo\s+\S+)",
|
|
"sql": r"(?i)\b(CREATE\s+TABLE|SELECT\s+\*|INSERT\s+INTO|UPDATE\s+\w+|DELETE\s+FROM)\b",
|
|
"batchfile": r"(?m)^(?!\s*@?call\b).*\b(echo\s+\S+|set\s+\w+|if\s+.*\s+==\s+)",
|
|
"fortran": r"(?mi)^(?!\s*use\b).*\b(program\s+\w+|subroutine\s+\w+|do\s+\d+\s*,\s*\d+)",
|
|
"haskell": r"(?m)^(?!\s*import\b).*\b(main\s*=\s*do|data\s+\w+|putStrLn\s+\S+)",
|
|
"lua": r"(?m)^(?!\s*require\b).*\b(function\s+\w+|local\s+\w+|print\s*\()",
|
|
"powershell": r"(?m)^(?!\s*Import-Module\b).*\b(function\s+\w+|Write-Host\s+\S+|\$\w+\s*=)",
|
|
"visual_basic": r"(?m)^(?!\s*Imports\b).*\b(Module\s+\w+|Sub\s+Main|Class\s+\w+)"
|
|
}
|
|
|
|
# 各语言注释处理规则
|
|
comment_patterns = {
|
|
'java': (r'//.*?$|/\*.*?\*/|\\/\\/.*?$|\\/\*.*?\*\\/', re.DOTALL | re.MULTILINE),
|
|
'python': (r'#.*?$', re.MULTILINE),
|
|
'javascript': (r'//.*?$|/\*.*?\*/|\\/\\/.*?$|\\/\*.*?\*\\/', re.DOTALL | re.MULTILINE),
|
|
'php': (r'//.*?$|#.*?$|/\*.*?\*/|\\/\\/.*?$|#.*?$|\\/\*.*?\*\\/', re.DOTALL | re.MULTILINE),
|
|
'ruby': (r'#.*', re.MULTILINE),
|
|
'go': (r'//.*?$|/\*.*?\*/|\\/\\/.*?$|\\/\*.*?\*\\/', re.DOTALL | re.MULTILINE),
|
|
'csharp': (r'//.*?$|/\*.*?\*/|\\/\\/.*?$|\\/\*.*?\*\\/', re.DOTALL | re.MULTILINE),
|
|
'cplusplus': (r'//.*?$|/\*.*?\*/|\\/\\/.*?$|\\/\*.*?\*\\/', re.DOTALL | re.MULTILINE),
|
|
'c': (r'//.*?$|/\*.*?\*/|\\/\\/.*?$|\\/\*.*?\*\\/', re.DOTALL | re.MULTILINE),
|
|
'rust': (r'//.*?$|/\*.*?\*/|\\/\\/.*?$|\\/\*.*?\*\\/', re.DOTALL | re.MULTILINE),
|
|
'typescript': (r'//.*?$|/\*.*?\*/|\\/\\/.*?$|\\/\*.*?\*\\/', re.DOTALL | re.MULTILINE),
|
|
'perl': (r'#.*', re.MULTILINE),
|
|
'shell': (r'#.*', re.MULTILINE),
|
|
'sql': (r'--.*?$|/\*.*?\*/', re.DOTALL),
|
|
'batchfile': (r'^\s*(REM|@REM|::).*', re.MULTILINE | re.IGNORECASE),
|
|
'fortran': (r'!.*', re.MULTILINE),
|
|
'haskell': (r'--.*', re.MULTILINE),
|
|
'lua': (r'--.*?$|--\[\[.*?\]\]', re.DOTALL),
|
|
'powershell': (r'<#.*?#>|#.*', re.DOTALL),
|
|
'visual_basic': (r"'.*", re.MULTILINE),
|
|
}
|
|
|
|
# 执行注释清理
|
|
if lang in comment_patterns:
|
|
pattern, flags = comment_patterns[lang]
|
|
cleaned_code = re.sub(pattern, '', cleaned_code, flags=flags)
|
|
cleaned_code = clean_empty_lines(cleaned_code)
|
|
|
|
# 特殊语言处理规则
|
|
if lang == 'fortran':
|
|
cleaned_code = re.sub(r'^[Cc*].*', '', cleaned_code, flags=re.MULTILINE)
|
|
elif lang == 'sql':
|
|
cleaned_code = re.sub(r'/\*.*?\*/', '', cleaned_code, flags=re.DOTALL)
|
|
elif lang == 'python':
|
|
cleaned_code = re.sub(r'^\s*#.*', '', cleaned_code, flags=re.MULTILINE)
|
|
|
|
# 函数定义检测及内容验证
|
|
def has_valid_code(text: str, lang: str) -> bool:
|
|
pattern = function_patterns.get(lang)
|
|
if not pattern:
|
|
return len(text.strip()) > 0
|
|
|
|
# 增强检测逻辑
|
|
if lang == 'batchfile':
|
|
return bool(re.search(r'^\s*@?echo\b|:\w+', text, re.MULTILINE))
|
|
if lang == 'shell':
|
|
return bool(re.search(r'^\s*(if|for|while|case|echo|export|shopt|source)\b', text, re.MULTILINE))
|
|
if lang == 'python':
|
|
if re.search(r'^\s*(def|class)\s+\w+', text, re.MULTILINE):
|
|
return bool(re.search(r'^\s+[^\s#]', text, re.MULTILINE))
|
|
return False
|
|
if lang == 'ruby':
|
|
return bool(re.search(r'(def\s+\w+|class\s+\w+).*?\n\s+[^\s#]', text, re.MULTILINE))
|
|
return bool(re.search(pattern, text, re.DOTALL | re.MULTILINE))
|
|
|
|
# 最终有效性检查
|
|
if not has_valid_code(cleaned_code, lang):
|
|
return ''
|
|
|
|
cleaned_code = cleaned_code.strip('\ufeff').strip()
|
|
|
|
if len(cleaned_code) < length_threshold:
|
|
return ''
|
|
|
|
return cleaned_code
|
|
|
|
|
|
if __name__ == "__main__":
|
|
test_text = "\/\/ ----------------------------------------------------------------------\n\/\/ ----------------------------------------------------------------------\n\/\/\n\/\/ File: StrMaxProjection.h\n\/\/ Author: mgrosso \n\/\/ Created: Mon Jul 17 14:39:22 PDT 2006 on caliban\n\/\/ Project: \n\/\/ Purpose: \n\/\/ \n\/\/ $Id$\n\/\/ ----------------------------------------------------------------------\n\/\/ ----------------------------------------------------------------------\n\n#ifndef STRMAXPROJECTION_H\n#define STRMAXPROJECTION_H 1\n\n#include \"StrMinProjection.h\"\n\nclass StrMaxProjection : public StrMinProjection\n{\n public:\n StrMaxProjection(ExpressionPtr &operand);\n virtual ~StrMaxProjection();\n virtual AbstractProjectionPtr copy();\n\n protected:\n int compare(const char *lhs, const char *rhs);\n\n private:\n \/\/not implemented\n StrMaxProjection();\n StrMaxProjection( const StrMaxProjection &rhs );\n StrMaxProjection &operator=( const StrMaxProjection &rhs );\n};\n\n#endif \/* STRMAXPROJECTION_H *\/"
|
|
|
|
result = clean_code(test_text, "c", 200)
|
|
print(result)
|
|
|
|
test_text = "\/**\n * Copyright (c) Microsoft Corporation. All rights reserved.\n * Licensed under the MIT License. See License.txt in the project root for\n * license information.\n *\n * Code generated by Microsoft (R) AutoRest Code Generator.\n *\/\n\npackage com.microsoft.azure.management.datafactory.v2018_06_01;\n\nimport com.fasterxml.jackson.annotation.JsonProperty;\nimport com.fasterxml.jackson.annotation.JsonTypeInfo;\nimport com.fasterxml.jackson.annotation.JsonTypeName;\n\n\/**\n * The location of Google Cloud Storage dataset.\n *\/\n@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = JsonTypeInfo.As.PROPERTY, property = \"type\", defaultImpl = GoogleCloudStorageLocation.class)\n@JsonTypeName(\"GoogleCloudStorageLocation\")\npublic class GoogleCloudStorageLocation extends DatasetLocation {\n \/**\n * Specify the bucketName of Google Cloud Storage. Type: string (or\n * Expression with resultType string).\n *\/\n @JsonProperty(value = \"bucketName\")\n private Object bucketName;\n\n \/**\n * Specify the version of Google Cloud Storage. Type: string (or Expression\n * with resultType string).\n *\/\n @JsonProperty(value = \"version\")\n private Object version;\n\n \/**\n * Get specify the bucketName of Google Cloud Storage. Type: string (or Expression with resultType string).\n *\n * @return the bucketName value\n *\/\n public Object bucketName() {\n return this.bucketName;\n }\n\n \/**\n * Set specify the bucketName of Google Cloud Storage. Type: string (or Expression with resultType string).\n *\n * @param bucketName the bucketName value to set\n * @return the GoogleCloudStorageLocation object itself.\n *\/\n public GoogleCloudStorageLocation withBucketName(Object bucketName) {\n this.bucketName = bucketName;\n return this;\n }\n\n \/**\n * Get specify the version of Google Cloud Storage. Type: string (or Expression with resultType string).\n *\n * @return the version value\n *\/\n public Object version() {\n return this.version;\n }\n\n \/**\n * Set specify the version of Google Cloud Storage. Type: string (or Expression with resultType string).\n *\n * @param version the version value to set\n * @return the GoogleCloudStorageLocation object itself.\n *\/\n public GoogleCloudStorageLocation withVersion(Object version) {\n this.version = version;\n return this;\n }\n\n}"
|
|
result = clean_code(test_text, "java", 200)
|
|
print(result)
|