// iconsulting/packages/services/knowledge-service/src/application/services/text-extraction.service.ts

/**
 * Text Extraction Service
 * 从上传的文件中提取文本内容（PDF、Word、TXT、Markdown）
 */

import { Injectable, BadRequestException, Logger } from '@nestjs/common';
import { PDFParse } from 'pdf-parse';
import * as mammoth from 'mammoth';

/**
 * Result of extracting text from a single uploaded file.
 */
export interface ExtractedContent {
  /** Extracted text with leading and trailing whitespace removed. */
  text: string;
  /** Human-readable title derived from the uploaded file's original name. */
  title: string;
  /** Page count as reported by the PDF parser; only set for PDF uploads. */
  pageCount?: number;
  /** Number of Chinese characters plus whitespace-separated non-Chinese tokens in the text. */
  wordCount: number;
}

/**
 * MIME types accepted for upload: PDF, DOCX, plain text and Markdown.
 * Declared read-only because the list is only ever queried, never modified.
 */
const ALLOWED_MIME_TYPES: readonly string[] = [
  'application/pdf',
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  'text/plain',
  'text/markdown',
];

/** Upper bound on the size of an uploaded file, in bytes (200 MiB). */
const MAX_FILE_SIZE = 200 * 1024 ** 2;

/**
 * Extracts plain text from uploaded files (PDF, DOCX, TXT, Markdown) and
 * returns it together with a derived title and a word count.
 */
@Injectable()
export class TextExtractionService {
  private readonly logger = new Logger(TextExtractionService.name);

  /**
   * Checks that an uploaded file has a supported MIME type and does not
   * exceed the maximum allowed size.
   *
   * @param file - The uploaded file to validate.
   * @throws BadRequestException if the MIME type is not in the allow-list or
   *   the file is larger than the size limit.
   */
  validateFile(file: Express.Multer.File): void {
    if (!ALLOWED_MIME_TYPES.includes(file.mimetype)) {
      throw new BadRequestException(
        `不支持的文件类型: ${file.mimetype}。支持: PDF, DOCX, TXT, MD`,
      );
    }
    if (file.size > MAX_FILE_SIZE) {
      throw new BadRequestException(
        `文件过大: ${(file.size / 1024 / 1024).toFixed(1)}MB，最大支持 200MB`,
      );
    }
  }

  /**
   * Validates the file and extracts its text using the extractor that matches
   * its MIME type.
   *
   * @param file - The uploaded file; its in-memory `buffer` is what gets parsed.
   * @returns The trimmed text, a title derived from the file name, the word
   *   count and, for PDFs, the page count.
   * @throws BadRequestException if validation fails or no text can be extracted.
   */
  async extractText(file: Express.Multer.File): Promise<ExtractedContent> {
    this.validateFile(file);

    const title = this.deriveTitleFromFilename(file.originalname);
    this.logger.log(
      `Extracting text from "${file.originalname}" (${file.mimetype}, ${(file.size / 1024).toFixed(0)}KB)`,
    );

    switch (file.mimetype) {
      case 'application/pdf':
        return this.extractFromPdf(file.buffer, title);
      case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        return this.extractFromDocx(file.buffer, title);
      case 'text/plain':
      case 'text/markdown':
        return this.extractFromText(file.buffer, title);
      default:
        // Defensive: validateFile() already rejects anything outside the allow-list.
        throw new BadRequestException(`不支持的文件类型: ${file.mimetype}`);
    }
  }

  /**
   * Extracts text from a PDF buffer.
   *
   * @param buffer - Raw bytes of the PDF file.
   * @param title - Title to attach to the result.
   * @returns The extracted content, including the page count.
   * @throws BadRequestException if the PDF yields no text.
   */
  private async extractFromPdf(
    buffer: Buffer,
    title: string,
  ): Promise<ExtractedContent> {
    const parser = new PDFParse({ data: new Uint8Array(buffer), verbosity: 0 });

    try {
      const result = await parser.getText();
      const text = result.text ? result.text.trim() : '';

      if (text.length === 0) {
        throw new BadRequestException(
          '无法从 PDF 中提取文本。该文件可能是扫描件或纯图片 PDF。',
        );
      }

      return {
        text,
        title,
        pageCount: result.total,
        wordCount: this.countWords(text),
      };
    } finally {
      // Always release the parser, including when getText() rejects on a
      // corrupt or unreadable PDF; otherwise its resources would leak.
      await parser.destroy();
    }
  }

  /**
   * Extracts raw text from a DOCX buffer.
   *
   * @param buffer - Raw bytes of the DOCX file.
   * @param title - Title to attach to the result.
   * @returns The extracted content (no page count).
   * @throws BadRequestException if the document yields no text.
   */
  private async extractFromDocx(
    buffer: Buffer,
    title: string,
  ): Promise<ExtractedContent> {
    const result = await mammoth.extractRawText({ buffer });
    const text = result.value ? result.value.trim() : '';

    if (text.length === 0) {
      throw new BadRequestException('无法从 Word 文档中提取文本。');
    }

    return {
      text,
      title,
      wordCount: this.countWords(text),
    };
  }

  /**
   * Decodes a plain-text or Markdown buffer as UTF-8.
   *
   * @param buffer - Raw bytes of the text file.
   * @param title - Title to attach to the result.
   * @returns The extracted content (no page count).
   * @throws BadRequestException if the file is empty or contains only
   *   whitespace, matching how the PDF and DOCX extractors treat empty content.
   */
  private extractFromText(
    buffer: Buffer,
    title: string,
  ): ExtractedContent {
    const text = buffer.toString('utf-8').trim();

    if (text.length === 0) {
      throw new BadRequestException('文件内容为空，无法提取文本。');
    }

    return {
      text,
      title,
      wordCount: this.countWords(text),
    };
  }

  /**
   * Builds a display title from a file name: drops the last extension,
   * replaces hyphens and underscores with spaces, and collapses whitespace.
   *
   * @param filename - Original name of the uploaded file.
   * @returns The cleaned-up title.
   */
  private deriveTitleFromFilename(filename: string): string {
    return filename
      .replace(/\.[^.]+$/, '')
      .replace(/[-_]/g, ' ')
      .replace(/\s+/g, ' ')
      .trim();
  }

  /**
   * Counts words in mixed Chinese/non-Chinese text. Each character in the
   * U+4E00–U+9FFF range counts as one word; the remaining text is split on
   * whitespace and each non-empty token counts as one word.
   *
   * @param text - Text to count.
   * @returns Number of Chinese characters plus number of other tokens.
   */
  private countWords(text: string): number {
    const chineseChars = (text.match(/[\u4e00-\u9fff]/g) || []).length;
    const englishWords = text
      .replace(/[\u4e00-\u9fff]/g, '')
      .split(/\s+/)
      .filter((w) => w.length > 0).length;
    return chineseChars + englishWords;
  }
}