import type { FileUploadCallback, TextExtractorErrorCallback } from '@/components/ExtractTextFromUpload/types';
import config from '@/config/frontend';
import type { FileMetadata } from '@magicschool/business-logic/tools';
import type { Language } from '@magicschool/supabase/types';
import { type ExtractTextFromImageOptions, extractTextFromImage } from './image';
import { extractTextFromDocx, extractTextFromPowerpoint } from './office';
import { extractTextFromPDF } from './pdf';

/**
 * Fits a list of extracted files into the per-generation word budget.
 *
 * Files are visited in order. Each file is charged its `numberOfWords` (a missing value counts
 * as 0), capped at `config.textExtract.maxWordsPerDocument`. As long as the running total stays
 * within `config.textExtract.maxWordsPerGeneration` the file is copied through unchanged.
 * A file that does not fit has its `content` cut down to the words still available, which
 * exhausts the budget; files after that point that carry a non-zero charge are cut to zero words.
 * `numberOfWords` is never rewritten on the copies.
 *
 * @param files - Files to fit into the budget. Neither the array nor its items are mutated.
 * @returns A new array holding one shallow copy per input file, in input order.
 */
export function fitFilesInWordLimit(files: FileMetadata[]): FileMetadata[] {
  const { maxWordsPerDocument: perDocumentCap, maxWordsPerGeneration: budget } = config.textExtract;
  let usedWords = 0;

  return files.map((file): FileMetadata => {
    const charged = Math.min(file.numberOfWords ?? 0, perDocumentCap);

    // The whole file still fits: account for it and pass a copy through untouched.
    if (usedWords + charged <= budget) {
      usedWords += charged;
      return { ...file };
    }

    // Otherwise hand the file whatever is left of the budget and mark the budget as spent.
    const remaining = budget - usedWords;
    usedWords += remaining;

    const tokens = file.content?.split(/\s+/);
    return { ...file, content: tokens?.slice(0, remaining).join(' ') };
  });
}

/**
 * Cleans up raw extracted text, enforces the per-document word limit and reports quality warnings.
 *
 * @param text - Raw text produced by one of the extractors.
 * @param locale - Language of the text; the spacing check only runs for locales listed in
 *   `config.textExtract.pdf.languagesWithoutALotOfSpaces`.
 * @param onWarning - Called with `extract_text.error.too_much_text` when the text had to be
 *   truncated, and with `extract_text.error.spaces_in_between` when the text averages too few
 *   characters per word (typically letters separated by spaces after a bad extraction).
 * @returns `numberOfWords`, the word count before truncation (0 for empty text), and `text`,
 *   the cleaned and possibly truncated text.
 */
export function processExtractedText(text: string, locale: Language, onWarning: TextExtractorErrorCallback) {
  const { maxWordsPerDocument, pdf: pdfConfig } = config.textExtract;

  // Remove NUL characters (\u0000), which break Postgres JSON/JSONB. Trim afterwards so that
  // whitespace sitting next to a leading/trailing NUL does not survive and create an empty word.
  text = text.replaceAll(/\0/g, '').trim();

  // Count whitespace-separated words. `''.split(...)` yields `['']`, so empty text is handled
  // explicitly to report 0 words instead of 1.
  const words = text.length > 0 ? text.split(/\s+/) : [];
  const numberOfWords = words.length;

  // If more than the limit, truncate the text and tell the user how much was extracted
  if (numberOfWords > maxWordsPerDocument) {
    text = words.slice(0, maxWordsPerDocument).join(' ');
    onWarning('extract_text.error.too_much_text', { words: numberOfWords, wordLimit: maxWordsPerDocument });
  }

  // If the text averages very few characters per word, the source probably had spaces in between
  // letters due to incorrect extraction. The average is taken over the words that are actually
  // kept in `text`; dividing the truncated length by the pre-truncation count would make every
  // long document look suspicious.
  const keptWords = Math.min(numberOfWords, maxWordsPerDocument);
  if (keptWords > 0 && pdfConfig.languagesWithoutALotOfSpaces.includes(locale)) {
    const charactersPerWord = text.length / keptWords;
    if (charactersPerWord < pdfConfig.otherToSpaceRatioErrorThreshold) {
      onWarning('extract_text.error.spaces_in_between');
    }
  }

  return { numberOfWords, text };
}

/** Options accepted by `extractTextFromFile`; currently adds nothing to `ExtractTextFromImageOptions`. */
interface ExtractTextFromFileOptions extends ExtractTextFromImageOptions {}

/**
 * Extracts the text of an uploaded file and returns its metadata together with the processed text.
 *
 * The extractor is picked from `file.type`: PDF, DOCX, PPTX, or a JPEG/PNG/TIFF/WebP image.
 * For a supported type, `onUpload` is called with the initial metadata (`id`, `name`, `type`)
 * before extraction starts. Any other MIME type is reported through `onError` with
 * `extract_text.error.invalid_file` and no extraction or upload callback happens.
 *
 * @param file - The file to read.
 * @param id - Identifier stored in the resulting metadata.
 * @param locale - Language forwarded to the PDF/image extractors and to the text post-processing.
 * @param onUpload - Notified once with the initial metadata of a supported file.
 * @param onError - Forwarded to the extractors; also used to report an unsupported MIME type.
 * @param onWarning - Forwarded to `processExtractedText`.
 * @param options - Forwarded to the PDF and image extractors; `extractionMethod` is recorded as
 *   `ocrProvider` for images and for PDFs whose extraction method is `'ocr'`.
 * @returns The file metadata extended with `numberOfWords` and `content`, or `null` when the
 *   MIME type is not supported.
 */
export async function extractTextFromFile(
  file: File,
  id: string,
  locale: Language,
  onUpload: FileUploadCallback,
  onError: TextExtractorErrorCallback,
  onWarning: TextExtractorErrorCallback,
  options?: ExtractTextFromFileOptions,
) {
  const mimeType = file.type;
  let metadata: FileMetadata;
  let extracted: string;

  if (mimeType === 'application/pdf') {
    const initial: FileMetadata = { id, name: file.name, type: 'pdf' };
    onUpload(initial);

    const { text: pdfText, method } = await extractTextFromPDF(file, locale, onError, options);
    extracted = pdfText;
    metadata = {
      ...initial,
      textExtractionMethod: method,
      // An OCR provider is only recorded when the PDF actually went through OCR.
      ocrProvider: method === 'ocr' ? options?.extractionMethod : undefined,
    };
  } else if (mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
    metadata = { id, name: file.name, type: 'docx' };
    onUpload(metadata);
    extracted = await extractTextFromDocx(file, onError);
  } else if (mimeType === 'application/vnd.openxmlformats-officedocument.presentationml.presentation') {
    metadata = { id, name: file.name, type: 'pptx' };
    onUpload(metadata);
    extracted = await extractTextFromPowerpoint(file, onError);
  } else if (mimeType === 'image/jpeg' || mimeType === 'image/png' || mimeType === 'image/tiff' || mimeType === 'image/webp') {
    const initial: FileMetadata = { id, name: file.name, type: 'image' };
    onUpload(initial);

    extracted = await extractTextFromImage(file, locale, onError, options);
    metadata = { ...initial, ocrProvider: options?.extractionMethod };
  } else {
    // Unsupported MIME type: report it and bail out without touching the other callbacks.
    onError('extract_text.error.invalid_file');
    return null;
  }

  // Post-process the extracted text and surface any problem through onWarning
  const processed = processExtractedText(extracted, locale, onWarning);
  return { ...metadata, numberOfWords: processed.numberOfWords, content: processed.text };
}

/**
 * Lookup table from a MIME type string to a short file-type label (e.g. `'pdf'`, `'docx'`).
 *
 * NOTE(review): `extractTextFromFile` in this file also accepts `image/tiff` and `image/webp`,
 * which have no entry here, while the Google Docs/Slides types listed here are not handled
 * there — confirm whether the difference is intentional.
 */
export const mimeTypeToFileType: Record<string, string> = {
  'application/vnd.google-apps.document': 'google-doc',
  'application/vnd.google-apps.presentation': 'google-slide',
  'application/pdf': 'pdf',
  'image/jpeg': 'image',
  'image/png': 'image',
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
  'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
};
