136 lines
3.7 KiB
TypeScript
136 lines
3.7 KiB
TypeScript
/**
 * Text Extraction Service
 *
 * Extracts text content from uploaded files (PDF, Word, TXT, Markdown).
 */
|
||
|
||
import { Injectable, BadRequestException, Logger } from '@nestjs/common';
|
||
import { PDFParse } from 'pdf-parse';
|
||
import * as mammoth from 'mammoth';
|
||
|
||
/**
 * Result of extracting plain text from an uploaded document.
 */
export interface ExtractedContent {
  /** Extracted text with leading and trailing whitespace removed. */
  text: string;
  /** Human-readable title derived from the uploaded file's name. */
  title: string;
  /** Number of pages; only populated for PDF input. */
  pageCount?: number;
  /** Word count: each CJK ideograph counts as one word, other text is split on whitespace. */
  wordCount: number;
}
|
||
|
||
/**
 * MIME types accepted for text extraction: PDF, DOCX, plain text and Markdown.
 * Declared read-only so the allow-list cannot be mutated at runtime.
 */
const ALLOWED_MIME_TYPES: readonly string[] = [
  'application/pdf',
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  'text/plain',
  'text/markdown',
];

/** Upper bound on the size of an uploaded file, in bytes (200 MiB). */
const MAX_FILE_SIZE = 200 * 1024 * 1024;
|
||
|
||
/**
 * Extracts plain text (plus a title and simple statistics) from uploaded
 * PDF, DOCX, plain-text and Markdown files.
 */
@Injectable()
export class TextExtractionService {
  private readonly logger = new Logger(TextExtractionService.name);

  /**
   * Checks that an uploaded file has a supported MIME type and does not
   * exceed the size limit.
   *
   * @param file Uploaded file as provided by Multer.
   * @throws BadRequestException if the MIME type is not in the allow-list or
   *   the file is larger than `MAX_FILE_SIZE`.
   */
  validateFile(file: Express.Multer.File): void {
    if (!ALLOWED_MIME_TYPES.includes(file.mimetype)) {
      throw new BadRequestException(
        `不支持的文件类型: ${file.mimetype}。支持: PDF, DOCX, TXT, MD`,
      );
    }

    if (file.size > MAX_FILE_SIZE) {
      throw new BadRequestException(
        `文件过大: ${(file.size / 1024 / 1024).toFixed(1)}MB,最大支持 200MB`,
      );
    }
  }

  /**
   * Validates the upload and extracts its text using the extractor that
   * matches the file's MIME type.
   *
   * @param file Uploaded file as provided by Multer; its `buffer` is read.
   * @returns The extracted text, a title derived from the file name, a word
   *   count and, for PDFs, the page count.
   * @throws BadRequestException if validation fails, the MIME type is not
   *   handled, or a PDF/DOCX file yields no text.
   */
  async extractText(file: Express.Multer.File): Promise<ExtractedContent> {
    this.validateFile(file);

    const title = this.deriveTitleFromFilename(file.originalname);
    this.logger.log(
      `Extracting text from "${file.originalname}" (${file.mimetype}, ${(file.size / 1024).toFixed(0)}KB)`,
    );

    switch (file.mimetype) {
      case 'application/pdf':
        return this.extractFromPdf(file.buffer, title);
      case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        return this.extractFromDocx(file.buffer, title);
      case 'text/plain':
      case 'text/markdown':
        return this.extractFromText(file.buffer, title);
      default:
        // Defensive: validateFile() already rejects MIME types outside the allow-list.
        throw new BadRequestException(`不支持的文件类型: ${file.mimetype}`);
    }
  }

  /**
   * Extracts text and the page count from a PDF.
   *
   * @param buffer Raw PDF bytes.
   * @param title Title to attach to the result.
   * @throws BadRequestException if the PDF contains no extractable text.
   */
  private async extractFromPdf(
    buffer: Buffer,
    title: string,
  ): Promise<ExtractedContent> {
    const parser = new PDFParse({ data: new Uint8Array(buffer), verbosity: 0 });

    try {
      const result = await parser.getText();
      const text = result.text ? result.text.trim() : '';

      if (text.length === 0) {
        throw new BadRequestException(
          '无法从 PDF 中提取文本。该文件可能是扫描件或纯图片 PDF。',
        );
      }

      return {
        text,
        title,
        pageCount: result.total,
        wordCount: this.countWords(text),
      };
    } finally {
      // Always release the parser, including when getText() rejects,
      // so a failed extraction does not leak the parser's resources.
      await parser.destroy();
    }
  }

  /**
   * Extracts raw text from a DOCX document using mammoth.
   *
   * @param buffer Raw DOCX bytes.
   * @param title Title to attach to the result.
   * @throws BadRequestException if the document contains no text.
   */
  private async extractFromDocx(
    buffer: Buffer,
    title: string,
  ): Promise<ExtractedContent> {
    const result = await mammoth.extractRawText({ buffer });
    const text = result.value ? result.value.trim() : '';

    if (text.length === 0) {
      throw new BadRequestException('无法从 Word 文档中提取文本。');
    }

    return {
      text,
      title,
      wordCount: this.countWords(text),
    };
  }

  /**
   * Decodes a plain-text or Markdown file as UTF-8.
   *
   * @param buffer Raw file bytes.
   * @param title Title to attach to the result.
   */
  private extractFromText(buffer: Buffer, title: string): ExtractedContent {
    const text = buffer.toString('utf-8').trim();

    return {
      text,
      title,
      wordCount: this.countWords(text),
    };
  }

  /**
   * Builds a display title from a file name: drops the final extension,
   * turns hyphens and underscores into spaces, and collapses whitespace.
   *
   * @param filename Original name of the uploaded file.
   */
  private deriveTitleFromFilename(filename: string): string {
    return filename
      .replace(/\.[^.]+$/, '')
      .replace(/[-_]/g, ' ')
      .replace(/\s+/g, ' ')
      .trim();
  }

  /**
   * Counts words in mixed Chinese/other text: every character in the
   * U+4E00–U+9FFF range counts as one word, and the remaining text is
   * split on whitespace.
   *
   * @param text Text to measure.
   */
  private countWords(text: string): number {
    const chineseChars = (text.match(/[\u4e00-\u9fff]/g) || []).length;
    const englishWords = text
      .replace(/[\u4e00-\u9fff]/g, '')
      .split(/\s+/)
      .filter((w) => w.length > 0).length;

    return chineseChars + englishWords;
  }
}
|