iconsulting/packages/services/knowledge-service/src/application/services/text-extraction.service.ts

136 lines
3.7 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
 * Text Extraction Service
 * Extracts text content from uploaded files (PDF, Word, TXT, Markdown).
 */
import { Injectable, BadRequestException, Logger } from '@nestjs/common';
import { PDFParse } from 'pdf-parse';
import * as mammoth from 'mammoth';
/**
 * Result of extracting text from an uploaded file, as returned by
 * TextExtractionService.extractText.
 */
export interface ExtractedContent {
/** Extracted text with leading and trailing whitespace trimmed. */
text: string;
/** Title derived from the uploaded file's original filename. */
title: string;
/** Page count reported by the PDF parser; only set for PDF input. */
pageCount?: number;
/** Word count as computed by the service's countWords helper (CJK characters plus whitespace-separated tokens). */
wordCount: number;
}
// MIME types accepted by validateFile: PDF, DOCX, plain text and Markdown.
// Each entry has a matching `case` in extractText's switch; keep the two in sync.
const ALLOWED_MIME_TYPES = [
'application/pdf',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'text/plain',
'text/markdown',
];
// Upper bound for `file.size` enforced by validateFile. The code formats sizes
// by dividing by 1024 * 1024 and labelling the result "MB", so this is in bytes.
const MAX_FILE_SIZE = 200 * 1024 * 1024; // 200MB
/**
 * Extracts plain text and basic metadata (title, word count, page count)
 * from uploaded PDF, DOCX, TXT and Markdown files.
 */
@Injectable()
export class TextExtractionService {
  private readonly logger = new Logger(TextExtractionService.name);

  /**
   * Rejects files whose MIME type or size is outside the accepted limits.
   *
   * @param file - Uploaded file object.
   * @throws BadRequestException if `file.mimetype` is not listed in
   *   `ALLOWED_MIME_TYPES` or `file.size` is greater than `MAX_FILE_SIZE`.
   */
  validateFile(file: Express.Multer.File): void {
    if (!ALLOWED_MIME_TYPES.includes(file.mimetype)) {
      throw new BadRequestException(
        `不支持的文件类型: ${file.mimetype}。支持: PDF, DOCX, TXT, MD`,
      );
    }
    if (file.size > MAX_FILE_SIZE) {
      // Derive the advertised limit from the constant so the message cannot
      // drift from the value that is actually enforced.
      const maxMegabytes = MAX_FILE_SIZE / 1024 / 1024;
      throw new BadRequestException(
        `文件过大: ${(file.size / 1024 / 1024).toFixed(1)}MB最大支持 ${maxMegabytes}MB`,
      );
    }
  }

  /**
   * Validates the file, then extracts its text with the extractor that
   * matches its MIME type.
   *
   * NOTE(review): reads `file.buffer`, which presumably requires the upload
   * to be held in memory rather than on disk — confirm against the caller.
   *
   * @param file - Uploaded file object.
   * @returns The extracted text, a title derived from the original filename,
   *   the word count and (for PDFs) the page count.
   * @throws BadRequestException if validation fails or no text can be
   *   extracted from a PDF or DOCX file.
   */
  async extractText(file: Express.Multer.File): Promise<ExtractedContent> {
    this.validateFile(file);
    const title = this.deriveTitleFromFilename(file.originalname);
    this.logger.log(
      `Extracting text from "${file.originalname}" (${file.mimetype}, ${(file.size / 1024).toFixed(0)}KB)`,
    );
    switch (file.mimetype) {
      case 'application/pdf':
        return this.extractFromPdf(file.buffer, title);
      case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        return this.extractFromDocx(file.buffer, title);
      case 'text/plain':
      case 'text/markdown':
        return this.extractFromText(file.buffer, title);
      default:
        // Defensive: validateFile has already rejected unknown types, so this
        // is reached only if an allowed type lacks a matching case above.
        throw new BadRequestException(`不支持的文件类型: ${file.mimetype}`);
    }
  }

  /**
   * Extracts text and page count from a PDF.
   *
   * @param buffer - Raw PDF bytes.
   * @param title - Title to attach to the result.
   * @throws BadRequestException if the PDF yields no text (for example a
   *   scanned or image-only document).
   */
  private async extractFromPdf(
    buffer: Buffer,
    title: string,
  ): Promise<ExtractedContent> {
    const parser = new PDFParse({ data: new Uint8Array(buffer), verbosity: 0 });
    try {
      const result = await parser.getText();
      if (!result.text || result.text.trim().length === 0) {
        throw new BadRequestException(
          '无法从 PDF 中提取文本。该文件可能是扫描件或纯图片 PDF。',
        );
      }
      const text = result.text.trim();
      return {
        text,
        title,
        pageCount: result.total,
        wordCount: this.countWords(text),
      };
    } finally {
      // Always release the parser, including when getText() rejects on a
      // corrupt or unreadable PDF; previously that path skipped destroy().
      await parser.destroy();
    }
  }

  /**
   * Extracts raw text from a DOCX document.
   *
   * @param buffer - Raw DOCX bytes.
   * @param title - Title to attach to the result.
   * @throws BadRequestException if the document yields no text.
   */
  private async extractFromDocx(
    buffer: Buffer,
    title: string,
  ): Promise<ExtractedContent> {
    const result = await mammoth.extractRawText({ buffer });
    if (!result.value || result.value.trim().length === 0) {
      throw new BadRequestException('无法从 Word 文档中提取文本。');
    }
    const text = result.value.trim();
    return {
      text,
      title,
      wordCount: this.countWords(text),
    };
  }

  /**
   * Decodes a plain-text or Markdown file as UTF-8.
   *
   * Unlike the PDF and DOCX extractors, an empty file is not rejected: it
   * produces empty text and a word count of 0.
   *
   * @param buffer - Raw file bytes.
   * @param title - Title to attach to the result.
   */
  private extractFromText(
    buffer: Buffer,
    title: string,
  ): ExtractedContent {
    const text = buffer.toString('utf-8').trim();
    return {
      text,
      title,
      wordCount: this.countWords(text),
    };
  }

  /**
   * Builds a display title from a filename: drops the final extension,
   * turns hyphens and underscores into spaces, collapses whitespace runs and
   * trims the result.
   *
   * @param filename - Original filename of the upload.
   */
  private deriveTitleFromFilename(filename: string): string {
    return filename
      .replace(/\.[^.]+$/, '')
      .replace(/[-_]/g, ' ')
      .replace(/\s+/g, ' ')
      .trim();
  }

  /**
   * Counts words in mixed Chinese/Latin text: each character in
   * U+4E00–U+9FFF counts as one word, and the remaining text contributes one
   * word per whitespace-separated token.
   *
   * @param text - Text to measure.
   */
  private countWords(text: string): number {
    const cjkPattern = /[\u4e00-\u9fff]/g;
    const chineseChars = (text.match(cjkPattern) || []).length;
    // Replace CJK characters with a space rather than removing them, so Latin
    // tokens separated only by Chinese text ("使用React和Vue") stay distinct
    // instead of fusing into a single token ("ReactVue").
    const englishWords = text
      .replace(cjkPattern, ' ')
      .split(/\s+/)
      .filter((w) => w.length > 0).length;
    return chineseChars + englishWords;
  }
}