import type { TextExtractorErrorCallback } from '@/components/ExtractTextFromUpload/types';
import config from '@/config/frontend';
import { logger } from '@magicschool/logger';
import JSZip from 'jszip';

/**
 * Reads an uploaded Word document and resolves with its plain text.
 *
 * Extraction is attempted in two stages:
 *  1. `window.mammoth.extractRawText` is run on the file's bytes.
 *  2. If mammoth yields no text, the bytes are decoded as text, split on the
 *     boundary captured by `config.textExtract.word.findBoundaryRegex`, and the
 *     first part containing `textContentTypeString` is returned after removing
 *     everything matched by `stripUnneededContentRegex`.
 *
 * The returned promise never rejects: every failure is reported through
 * `onError` and the promise resolves with an empty string.
 *
 * @param file - The uploaded document to read.
 * @param onError - Receives an error key describing the failure.
 * @returns The extracted text, or `''` when nothing could be extracted.
 */
export function extractTextFromDocx(file: File, onError: TextExtractorErrorCallback): Promise<string> {
  return new Promise<string>((resolve) => {
    const wordConfig = config.textExtract.word;

    const reader = new FileReader();

    // Without this handler a failed read would leave the promise pending forever.
    reader.onerror = () => {
      onError('word_extract.error.unknown_error');
      logger.error('Error extracting text from word document', reader.error?.message);
      resolve('');
    };

    reader.onload = async (event) => {
      try {
        const arrayBuffer = event.target?.result as ArrayBuffer;

        // Try to use mammoth to extract text from the docx file
        const res = await window.mammoth.extractRawText({ arrayBuffer });
        if (res.value) return resolve(res.value);

        // If no text was extracted, try parsing manually
        const docxString = new TextDecoder().decode(arrayBuffer);

        // Find the boundary that separates the parts of the document.
        // Report the specific "invalid document" key directly; throwing here
        // would be caught below and surface as a generic unknown error instead.
        const boundary = docxString.match(wordConfig.findBoundaryRegex)?.[1];
        if (!boundary) {
          onError('word_extract.error.invalid_docx');
          return resolve('');
        }

        // Split the document into parts and take the first part that has HTML.
        // Ignore the first part, because it is just metadata.
        const htmlPart = docxString
          .split(boundary)
          .slice(1)
          .find((docPart) => docPart.includes(wordConfig.textContentTypeString));
        if (htmlPart !== undefined) {
          return resolve(htmlPart.replaceAll(wordConfig.stripUnneededContentRegex, ''));
        }

        // If no HTML was found, report the document as invalid
        onError('word_extract.error.invalid_docx');
      } catch (e: unknown) {
        onError('word_extract.error.unknown_error');
        logger.error('Error extracting text from word document', e instanceof Error ? e.message : String(e));
      }

      resolve('');
    };

    reader.readAsArrayBuffer(file);
  });
}

/**
 * Gets the text of every element with a specific tag name and namespace URI.
 *
 * Elements whose `textContent` is empty or null are skipped; the remaining
 * texts are joined with a single space and the result is trimmed.
 *
 * @param node - The parsed document to search.
 * @param tagName - Local name of the elements to collect (without prefix).
 * @param namespaceURI - Namespace the elements must belong to.
 * @returns The space-separated text, or `''` when no element has text.
 */
function getTextFromNodes(node: Document, tagName: string, namespaceURI: string): string {
  const elements = node.getElementsByTagNameNS(namespaceURI, tagName);

  // Array.from visits indexed entries only. A for..in loop over the collection
  // would also enumerate `length`, `item` and `namedItem`.
  return Array.from(elements, (element) => element.textContent)
    .filter((content) => Boolean(content))
    .join(' ')
    .trim();
}

/**
 * Extracts text from a PPTX file.
 *
 * The archive is opened with JSZip, every `ppt/slides/slide<N>.xml` entry is
 * parsed as XML, and the text of all `t` elements in the DrawingML namespace
 * is collected. Slides are processed in ascending order of `<N>`, and each
 * slide's text is separated from the next by a space.
 *
 * Failures (unreadable archive, unreadable entry) are not swallowed here: the
 * returned promise rejects so that the caller can report the problem instead
 * of treating a broken file as an empty presentation.
 *
 * @param arrayBuffer - Raw bytes of the PPTX file.
 * @returns The concatenated slide text, or `''` when there are no slides.
 */
async function getTextFromPPTX(arrayBuffer: ArrayBuffer): Promise<string> {
  const zip = new JSZip();
  await zip.loadAsync(arrayBuffer);

  const aNamespace = 'http://schemas.openxmlformats.org/drawingml/2006/main';
  const slidePathPattern = /^ppt\/slides\/slide(\d+)\.xml$/;
  const slideNumber = (path: string) => Number(slidePathPattern.exec(path)?.[1]);

  // List the slide entries rather than probing slide1, slide2, ... until one is
  // missing: a gap in the numbering would otherwise drop every later slide.
  const slideFiles = zip.file(slidePathPattern).sort((a, b) => slideNumber(a.name) - slideNumber(b.name));

  // One parser is enough for all slides.
  const parser = new DOMParser();
  let text = '';

  for (const slideFile of slideFiles) {
    const slideXmlStr = await slideFile.async('text');
    const xmlDoc = parser.parseFromString(slideXmlStr, 'application/xml');

    text += `${getTextFromNodes(xmlDoc, 't', aNamespace)} `;
  }

  return text.trim();
}

/**
 * Reads an uploaded PowerPoint file and resolves with its plain text.
 *
 * The file is read into an ArrayBuffer and handed to `getTextFromPPTX`.
 * The returned promise never rejects: a failed read, or any error thrown while
 * extracting, is reported through `onError`, logged, and resolved as `''`.
 *
 * @param file - The uploaded presentation to read.
 * @param onError - Receives an error key describing the failure.
 * @returns The extracted text, or `''` when nothing could be extracted.
 */
export function extractTextFromPowerpoint(file: File, onError: TextExtractorErrorCallback): Promise<string> {
  return new Promise<string>((resolve) => {
    const reader = new FileReader();

    // Without this handler a failed read would leave the promise pending forever.
    reader.onerror = () => {
      onError('extract_text.error.unknown_error');
      logger.error('Error extracting text from powerpoint document', reader.error?.message);
      resolve('');
    };

    reader.onload = async (event) => {
      try {
        const arrayBuffer = event.target?.result as ArrayBuffer;
        resolve(await getTextFromPPTX(arrayBuffer));
      } catch (e: unknown) {
        onError('extract_text.error.unknown_error');
        logger.error('Error extracting text from powerpoint document', e instanceof Error ? e.message : String(e));
        resolve('');
      }
    };

    reader.readAsArrayBuffer(file);
  });
}
