import type { TextExtractorErrorCallback } from '@/components/ExtractTextFromUpload/types';
import config from '@/config/frontend';
import { logger } from '@magicschool/logger';
import type { Language } from '@magicschool/supabase/types';
import type { PDFDocumentProxy, PDFPageProxy } from 'pdfjs-dist';
import { checkIfFileHasValidMimeType } from '../files/util';
import { type ExtractTextFromImageOptions, describeImages } from './image';
import { imageTextExtractor } from './tesseract';

// Operator codes that pdf.js reports in a page's operator list (`fnArray`) when an image is painted.
// They are copied from pdfjs because we can't import them directly; they don't seem to ever change,
// but should be re-checked whenever the pdf.js version is upgraded.
const PAINT_IMAGE_XOBJECT = 85;
const PAINT_INLINE_IMAGE_XOBJECT = 86;

// If a PDF has more than this many images per word of embedded text, OCR is used instead of the embedded text.
const BREAKPOINT_RATIO_OF_IMAGES_TO_WORDS = 1 / 100;

/**
 * Opens an uploaded PDF file with pdf.js.
 *
 * The file's bytes are read into memory and handed to pdf.js directly. This avoids creating a blob
 * URL with `URL.createObjectURL`, which would stay alive (and keep the file in memory) until it is
 * explicitly revoked.
 *
 * @param pdf - The PDF file selected by the user.
 * @returns The loaded pdf.js document.
 */
export async function createPDFDocument(pdf: File) {
  // pdf.js is read from the `window.pdfjsLib` global rather than imported, because Next.js would
  // include it in every page otherwise.
  // NOTE(review): the worker version in this URL presumably has to match the pdfjsLib build on `window` — confirm on upgrade.
  window.pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://unpkg.com/pdfjs-dist@4.2.67/build/pdf.worker.min.mjs';

  const data = new Uint8Array(await pdf.arrayBuffer());
  return await window.pdfjsLib.getDocument({ data }).promise;
}

/**
 * Loads every page of a pdf.js document.
 *
 * @param pdf - The loaded document.
 * @returns The pages in document order (pdf.js page numbers start at 1).
 */
export async function getPDFPages(pdf: PDFDocumentProxy) {
  // All page requests are started up front and awaited together.
  const pageRequests = Array.from({ length: pdf.numPages }, (_, index) => pdf.getPage(index + 1));
  return await Promise.all(pageRequests);
}

/**
 * Reads the embedded (non-OCR) text of each page.
 *
 * @param pages - Pages to read.
 * @returns One string per page, made of the page's non-empty text fragments joined by single spaces.
 */
export async function extractTextFromPDFPages(pages: PDFPageProxy[]) {
  const contents = await Promise.all(pages.map(async (page) => page.getTextContent()));

  return contents.map(({ items }) => {
    const fragments: string[] = [];
    for (const item of items) {
      // Items without a `str` property (and items whose `str` is empty) contribute nothing.
      const fragment = 'str' in item ? item.str : undefined;
      if (fragment) fragments.push(fragment);
    }
    return fragments.join(' ');
  });
}

/**
 * Draws a PDF page onto a freshly created, detached canvas at 1.5x scale.
 *
 * @param page - The page to render.
 * @returns The canvas holding the rendered page, or `null` when no 2D context could be obtained.
 */
async function renderPDFPageToCanvas(page: PDFPageProxy) {
  const pageViewport = page.getViewport({ scale: 1.5 });
  const target = document.createElement('canvas');
  const context2d = target.getContext('2d');
  if (!context2d) return null;

  // Size the canvas to the scaled page before drawing into it.
  target.height = pageViewport.height;
  target.width = pageViewport.width;

  await page.render({ canvasContext: context2d, viewport: pageViewport }).promise;
  return target;
}

/**
 * OCRs the given pages by rendering each one to a JPEG data URL and passing the images to `describeImages`.
 *
 * @param pages - Pages to recognise.
 * @param language - Language forwarded to `describeImages`.
 * @returns Whatever `describeImages` resolves to for the rendered pages.
 */
async function extractTextUsingOpenAI(pages: PDFPageProxy[], language: Language) {
  const renderAsJpeg = async (page: PDFPageProxy) => {
    const canvas = await renderPDFPageToCanvas(page);
    return canvas ? canvas.toDataURL('image/jpeg') : null;
  };

  // Render every page in parallel, then drop the pages that could not be rendered.
  const dataUrls = await Promise.all(pages.map((page) => renderAsJpeg(page)));
  const renderedPages = dataUrls.filter((url) => url !== null);

  return await describeImages(renderedPages, language);
}

/**
 * OCRs the given pages with the shared Tesseract-based `imageTextExtractor`.
 *
 * @param pages - Pages to recognise.
 * @param language - Language the shared extractor is (re)initialised with.
 * @returns The recognised text of all renderable pages, separated by blank lines.
 */
async function extractTextUsingTesseract(pages: PDFPageProxy[], language: Language) {
  // Point the global extractor at the current locale. When the locale & worker count already match,
  // this lets the same extractor be shared across all uploader components.
  await imageTextExtractor.initialize(language);

  // Kick off the page -> canvas -> OCR pipeline for every page at once.
  const ocrJobs = pages.map(async (page) => {
    const canvas = await renderPDFPageToCanvas(page);
    return canvas ? await imageTextExtractor.extract(canvas) : null;
  });
  const perPageText = await Promise.all(ocrJobs);

  // Pages that could not be rendered are skipped.
  const recognised = perPageText.filter((pageText) => pageText !== null);
  return recognised.join('\n\n');
}

/** Options for running OCR over PDF pages, on top of the shared image-extraction options. */
interface PDFOCROptions extends ExtractTextFromImageOptions {
  /** Language passed to the OCR backend. */
  language: Language;
  /** NOTE(review): not read by any code visible in this file — confirm whether it is still needed. */
  workerCount?: number;
}

/**
 * Runs OCR over already-loaded PDF pages using the requested extraction backend.
 *
 * @param pages - Pages to render and recognise.
 * @param options - `language` is forwarded to the backend and `extractionMethod` selects it.
 *   `workerCount` is accepted by the options type but is not read here.
 * @returns The text produced by the chosen backend.
 * @throws Error when `extractionMethod` is not a supported backend, instead of silently resolving to `undefined`.
 */
export async function extractTextFromPDFConcurrently(pages: PDFPageProxy[], { language, extractionMethod }: PDFOCROptions) {
  switch (extractionMethod) {
    case 'tesseract':
      return await extractTextUsingTesseract(pages, language);
    case 'openai':
      return await extractTextUsingOpenAI(pages, language);
    default:
      // Only reachable when a caller bypasses the type system (e.g. untyped or missing option values).
      throw new Error(`Unsupported PDF text extraction method: ${String(extractionMethod)}`);
  }
}

/**
 * Heuristic for whether text extraction produced a usable result.
 *
 * @param text - The extracted text.
 * @returns `true` when the trimmed text, with newlines removed, is longer than the configured
 *   `textDidNotParseErrorCharThreshold`.
 */
export function checkIfTextWasExtracted(text: string) {
  const withoutNewlines = text.trim().split('\n').join('');
  const minimumLength = config.textExtract.pdf.textDidNotParseErrorCharThreshold;
  return withoutNewlines.length > minimumLength;
}

/**
 * Counts the image paint operations across all pages of a document.
 *
 * @param pdf - The loaded document.
 * @returns The number of operators equal to PAINT_IMAGE_XOBJECT or PAINT_INLINE_IMAGE_XOBJECT.
 */
async function getNumberOfImagesInPDF(pdf: PDFDocumentProxy) {
  const isImagePaintOp = (op: number) => op === PAINT_IMAGE_XOBJECT || op === PAINT_INLINE_IMAGE_XOBJECT;

  let total = 0;
  // Pages are processed one after another, starting at page 1.
  for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber += 1) {
    const page = await pdf.getPage(pageNumber);
    const { fnArray } = await page.getOperatorList();
    total += fnArray.filter((op) => isImagePaintOp(op)).length;
  }
  return total;
}

/** Options accepted by `extractTextFromPDF`; currently identical to the shared image-extraction options. */
interface ExtractTextFromPDFOptions extends ExtractTextFromImageOptions {}

/** Result of `extractTextFromPDF`. */
interface ExtractTextFromPDFResponse {
  /** The extracted text; empty when extraction failed. */
  text: string;
  /** `'ocr'` when the text came from OCR; `'text'` otherwise (including the failure results). */
  method: 'ocr' | 'text';
}

/**
 * Full flow of text extraction from a PDF file.
 *
 * 1. Validates the file's MIME type.
 * 2. Reads the embedded text and counts the images in the document.
 * 3. If the document is image-heavy relative to its word count, runs OCR. When OCR yields nothing
 *    usable, falls back to the embedded text if that passes the same usability check.
 * 4. Otherwise returns the embedded text.
 *
 * Failures are reported through `onError` rather than thrown; in that case the returned text is
 * empty (or, on the embedded-text path, whatever short text was found).
 *
 * @param file - The uploaded PDF.
 * @param locale - Language forwarded to the OCR backend.
 * @param onError - Called with an error key when the file is invalid, no text is found, or an
 *   unexpected error occurs.
 * @param options - `extractionMethod` selects the OCR backend; defaults to `'tesseract'` when the
 *   options object is omitted.
 * @returns The extracted text and whether it came from OCR or from the embedded text.
 */
export async function extractTextFromPDF(
  file: File,
  locale: Language,
  onError: TextExtractorErrorCallback,
  { extractionMethod }: ExtractTextFromPDFOptions = { extractionMethod: 'tesseract' },
): Promise<ExtractTextFromPDFResponse> {
  const isFileValid = await checkIfFileHasValidMimeType(file, ['pdf']);
  if (!isFileValid) {
    onError('pdf_extract.error.invalid_file');
    return { text: '', method: 'text' };
  }

  let pdf: PDFDocumentProxy | undefined;
  try {
    pdf = await createPDFDocument(file);
    const pages = await getPDFPages(pdf);

    // Get all plain text and number of images in the PDF
    const [textPerPage, numberOfImages] = await Promise.all([extractTextFromPDFPages(pages), getNumberOfImagesInPDF(pdf)]);

    const text = textPerPage.join('\n\n');
    // `split` always yields at least one element, so the ratio below never divides by zero.
    const words = text.split(/\s+/);

    // If there are too many images, then use OCR
    if (numberOfImages / words.length > BREAKPOINT_RATIO_OF_IMAGES_TO_WORDS) {
      const ocrText = await extractTextFromPDFConcurrently(pages, { language: locale, extractionMethod });
      if (checkIfTextWasExtracted(ocrText)) return { text: ocrText, method: 'ocr' };

      // OCR produced nothing usable: prefer valid embedded text over returning nothing.
      if (checkIfTextWasExtracted(text)) return { text, method: 'text' };

      // Neither source produced usable text; report it instead of failing silently.
      onError('pdf_extract.error.no_text_found');
      return { text: '', method: 'text' };
    }

    if (!checkIfTextWasExtracted(text)) onError('pdf_extract.error.no_text_found');
    return { text, method: 'text' };
  } catch (e: unknown) {
    onError('pdf_extract.error.unknown_error');
    logger.error('Error extracting text from PDF', e instanceof Error ? e.message : String(e));
    return { text: '', method: 'text' };
  } finally {
    // Release the pdf.js document (and its worker). Cleanup problems must never change the result.
    if (pdf) {
      try {
        await pdf.destroy();
      } catch {
        // Ignored on purpose: extraction has already finished at this point.
      }
    }
  }
}
