// Matches queued PDF files against a CandidateMap by examining each file's text content and looking for the provided marker.
import promiseLimit from 'p-limit'
import {
	CandidateMap,
	MatchCertainty,
	MatchResult,
	matchSubjectsToCandidates,
	matchSubjectToCandidates,
} from './StringMatcher'
import { FileParsingState, MatcherFileInUploadQueue } from '~/app/domain/assets/FileUpload'
import {
	getBestGuessedTextValueNextToMarker,
	getComposedStringsOfPdfFile,
	getFirstStringAfterFirstMarker,
	getGuessFromLineWithTextMarker,
	readPdf,
} from './PdfParser'
import type { ModelValue as Area } from '~/components/inputs/InputPdfArea.vue'

/**
 * Matches every queued PDF file against the given candidates, one file at a time.
 *
 * For each file, `parsingState` is set to `READING` when processing starts and to `READ` once a
 * match result (possibly `null`) is available. It is left at `READING` if processing fails.
 *
 * @param  candidates  The candidates to find in each file.
 * @param  pdfsInQueue  The queued PDF files to process.
 * @param  marker  The keyword identifying which parts of each document are matched against the candidates.
 * @param  onPdfMatched  Optional callback invoked per file as soon as its match result is known.
 * @param  area  Optional area forwarded to the PDF parsing helpers.
 * @param  abortSignal  Optional signal; files whose processing starts after it was aborted are rejected with `Error('Aborted')`.
 * @returns  The match results in the same order as `pdfsInQueue`.
 * @throws  Rejects if the signal was aborted or if reading/matching any file fails.
 */
export async function matchQueuedPdfFilesToCandidates(
	candidates: CandidateMap,
	pdfsInQueue: MatcherFileInUploadQueue[],
	marker: string,
	onPdfMatched?: (pdf: MatcherFileInUploadQueue, match: MatchResult | null) => unknown,
	area?: Area,
	abortSignal?: AbortSignal,
) {
	// PERFORMANCE NOTICE:
	// limiter configured after testing with 100 5-paged PDFs
	// Performance peaked at 4 concurrent workers and degraded slowly with more
	// Memory consumption was proportional to the amount of concurrent workers, resulting in crashes for too many workers
	// const limiter = promiseLimit(4) // fixme: concurrency above 1 results in some promises never being resolved (mostly on FF, partly on Chrome)
	// NOTE(review): failures of the matcher used to be swallowed, which left the wrapping promise pending forever;
	// this may have contributed to the behaviour described in the fixme above - confirm before raising the limit.
	const limiter = promiseLimit(1)

	const makeMatchPromise = async (pdfInQueue: MatcherFileInUploadQueue) => {
		// Checked when the limiter actually starts the task, so files still waiting in the queue are skipped after an abort.
		if (abortSignal?.aborted) {
			throw new Error('Aborted')
		}

		return new Promise<MatchResult | null>((resolve, reject) => {
			pdfInQueue.parsingState = FileParsingState.READING

			matchPdfTextContentToCandidates(
				candidates,
				pdfInQueue.file,
				marker,
				(match: MatchResult | null) => {
					pdfInQueue.parsingState = FileParsingState.READ
					// Invoke callbacks, if provided.
					if (onPdfMatched) {
						onPdfMatched(pdfInQueue, match)
					}
					resolve(match)

					return match
				},
				area,
			).catch((error: unknown) => {
				// Without this the promise would stay pending forever whenever reading or matching throws
				// (or the callback itself throws), blocking the limiter and hanging Promise.all below.
				// If the result was already delivered via resolve(), this reject is a no-op.
				reject(error)
			})
		})
	}

	// Schedule every file through the limiter and wait for all of them.
	return await Promise.all(pdfsInQueue.map((file) => limiter(() => makeMatchPromise(file))))
}

/**
 * Matches the text content of a single PDF file against the candidates and reports the outcome
 * through `onPdfMatched`, which is invoked exactly once on every non-throwing path.
 *
 * Strategies are tried in order; the first one producing a match with a certainty of at least
 * `MatchCertainty.confident` wins:
 *  1. a text value found next to the marker,
 *  2. the first string after the first marker,
 *  3. a guess taken from the line containing the marker,
 *  4. fallback: the best match among all composed strings that contain the marker (case-insensitive),
 *     accepted regardless of its certainty.
 *
 * @param  candidates  The strings to find in the file.
 * @param  file  The PDF file to match against.
 * @param  marker  The keyword identifying which parts of the document are matched against the candidates.
 * @param  onPdfMatched  Callback receiving the match result, or `null` if the file could not be read, has no text content, or nothing matched.
 * @param  area  Optional area forwarded to the PDF parsing helpers.
 * @throws  Re-throws any error raised while parsing or matching; non-`Error` values are wrapped in an `Error`.
 */
const matchPdfTextContentToCandidates = async (
	candidates: CandidateMap,
	file: File,
	marker: string,
	onPdfMatched: (match: MatchResult | null) => unknown,
	area?: Area,
) => {
	const readFile = await readPdf(file)
	try {
		if (!readFile) {
			onPdfMatched(null)
			return
		}

		// Cheap, targeted strategies first. Each of them may pick up unrelated text near the marker,
		// so a guess is only accepted when it yields a confident match.
		const guessStrategies = [
			// a simple text node containing the marker and something else
			getBestGuessedTextValueNextToMarker,
			// the text item directly after the first marker
			getFirstStringAfterFirstMarker,
			// a value in the same line after the first marker
			getGuessFromLineWithTextMarker,
		]
		for (const findGuess of guessStrategies) {
			const guess = await findGuess(readFile, marker, area)
			if (!guess) {
				continue
			}

			const guessMatch = matchSubjectToCandidates(candidates, guess, guess)
			if (guessMatch && guessMatch.certainty >= MatchCertainty.confident) {
				onPdfMatched(guessMatch)
				return
			}
		}

		// Fallback: use PDF.js to load the whole text content of the file.
		const pages = await getComposedStringsOfPdfFile(readFile, area)

		// Text content might not be available
		if (!pages) {
			onPdfMatched(null)
			return
		}

		// Collect the strings of all pages that contain our wanted marker, ignoring case.
		const upperCaseMarker = marker.toUpperCase()
		const strings = pages.flatMap((page) => page.filter((str) => str.toUpperCase().includes(upperCaseMarker)))

		// Find the best match of our candidates within the filtered strings; on equal certainty the earlier match wins.
		const matches = matchSubjectsToCandidates(candidates, strings)
		const best = matches.reduce(
			(bestSoFar, match) => (match && (bestSoFar === null || match.certainty > bestSoFar.certainty) ? match : bestSoFar),
			null as MatchResult | null,
		)

		onPdfMatched(best)
	} catch (e) {
		// Normalize whatever was thrown into an Error instance.
		if (e instanceof Error) throw e
		throw new Error(String(e))
	} finally {
		// Always release the loaded document.
		// NOTE(review): if destroy() returns a promise it is not awaited here - confirm whether that is intended.
		readFile?.destroy()
	}
}
