/*
 * Decompiled with CFR 0.152.
 */
package com.itextpdf.pdfocr.tesseract4;

import com.itextpdf.commons.utils.MessageFormatUtil;
import com.itextpdf.commons.utils.SystemUtil;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.pdfocr.TextInfo;
import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrEngineProperties;
import com.itextpdf.pdfocr.tesseract4.TextPositioning;
import com.itextpdf.pdfocr.tesseract4.exceptions.PdfOcrInputTesseract4Exception;
import com.itextpdf.pdfocr.tesseract4.exceptions.PdfOcrTesseract4Exception;
import com.itextpdf.styledxmlparser.jsoup.Jsoup;
import com.itextpdf.styledxmlparser.jsoup.nodes.Document;
import com.itextpdf.styledxmlparser.jsoup.nodes.Element;
import com.itextpdf.styledxmlparser.jsoup.nodes.Node;
import com.itextpdf.styledxmlparser.jsoup.select.Elements;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Static helpers for the Tesseract 4 OCR engine: parsing of hOCR output into
 * {@link TextInfo} items, bounding-box handling, small file utilities and
 * execution of the external command.
 *
 * <p>The class is not instantiable; all members are static.
 */
public class TesseractHelper {
    private static final Logger LOGGER = LoggerFactory.getLogger(TesseractHelper.class);
    /** Matches a hOCR {@code title} attribute containing {@code bbox} followed by four integers. */
    private static final Pattern BBOX_PATTERN = Pattern.compile(".*bbox(\\s+\\d+){4}.*");
    /** Captures four whitespace-separated integers (left, top, right, bottom). */
    private static final Pattern BBOX_COORDINATE_PATTERN = Pattern.compile(".*\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+).*");
    /** Captures the {@code x_wconf <number>} fragment of a word {@code title} attribute. */
    private static final Pattern WCONF_PATTERN = Pattern.compile("^.*(x_wconf *\\d+).*$");
    private static final int BBOX_ARRAY_SIZE = 4;
    private static final int LEFT_IDX = 0;
    private static final int TOP_IDX = 1;
    private static final int RIGHT_IDX = 2;
    private static final int BOTTOM_IDX = 3;
    /** Conversion factor applied by {@link #toPoints(float)} and {@link #toPixels(float)}. */
    private static final float PX_TO_PT = 0.75f;
    private static final String NEW_LINE_PATTERN = "\n+";
    private static final String SPACE_PATTERN = " +";
    private static final String NEW_LINE_OR_SPACE_PATTERN = "[\n ]+";
    private static final String PAGE_PREFIX_PATTERN = "page_";
    private static final String OCR_PAGE = "ocr_page";
    private static final String OCR_LINE = "ocr_line";
    private static final String OCR_CAPTION = "ocr_caption";
    private static final String OCRX_WORD = "ocrx_word";
    private static final String TITLE = "title";
    private static final String X_WCONF = "x_wconf";

    private TesseractHelper() {
    }

    /**
     * Parses the given hOCR files and collects the recognized text per page.
     *
     * <p>Entries of {@code inputFiles} that are {@code null} or do not exist are skipped.
     * Pages that yield no text are omitted. If a page number is already present in the
     * result, the page is stored under {@code max(existing keys) + 1} instead.
     * Every node whose bounding box could not be parsed is reported once with a warning.
     *
     * @param inputFiles hOCR files to parse
     * @param txtInputFiles optional plain-text files, index-aligned with {@code inputFiles};
     *                      may be {@code null}
     * @param tesseract4OcrEngineProperties properties supplying the text positioning mode
     *                                      and the minimal confidence level
     * @return insertion-ordered map from page number to the text items found on that page
     * @throws IOException if a txt or hOCR file cannot be read
     */
    static Map<Integer, List<TextInfo>> parseHocrFile(List<File> inputFiles, List<File> txtInputFiles, Tesseract4OcrEngineProperties tesseract4OcrEngineProperties) throws IOException {
        Map<Integer, List<TextInfo>> imageData = new LinkedHashMap<>();
        Map<String, Node> unparsedBBoxes = new LinkedHashMap<>();
        for (int inputFileIdx = 0; inputFileIdx < inputFiles.size(); ++inputFileIdx) {
            File inputFile = inputFiles.get(inputFileIdx);
            List<String> txt = null;
            if (txtInputFiles != null) {
                txt = Files.readAllLines(txtInputFiles.get(inputFileIdx).toPath(), StandardCharsets.UTF_8);
            }
            if (inputFile == null || !Files.exists(Paths.get(inputFile.getAbsolutePath()))) {
                continue;
            }
            // try-with-resources: the stream is released even when parsing throws.
            try (InputStream hocrStream = new FileInputStream(inputFile.getAbsolutePath())) {
                Document doc = Jsoup.parse(hocrStream, StandardCharsets.UTF_8.name(), inputFile.getAbsolutePath());
                for (Element page : doc.getElementsByClass(OCR_PAGE)) {
                    // The page number is whatever follows the last "page_" in the element id.
                    String[] idParts = page.id().split(PAGE_PREFIX_PATTERN);
                    int pageNumber = Integer.parseInt(idParts[idParts.length - 1]);
                    List<TextInfo> textData = getTextData(page, tesseract4OcrEngineProperties, txt, unparsedBBoxes);
                    if (textData.isEmpty()) {
                        continue;
                    }
                    if (imageData.containsKey(pageNumber)) {
                        // Avoid overwriting a page already collected from another file.
                        pageNumber = Collections.max(imageData.keySet()) + 1;
                    }
                    imageData.put(pageNumber, textData);
                }
            }
        }
        for (Node node : unparsedBBoxes.values()) {
            LOGGER.warn(MessageFormatUtil.format("Cannot parse node BBox, defaults to 0, 0, 0, 0. Node: {0}", node.toString()));
        }
        return imageData;
    }

    /**
     * Parses the bounding box of {@code object} and, for word-based positioning modes,
     * aligns it against the bounding box of its parent element.
     *
     * <p>For {@link TextPositioning#BY_WORDS_AND_LINES} the vertical extent of the parent
     * box is used. For both word-based modes the horizontal extent is then repaired by
     * {@link #detectAndFixBrokenBBoxes}.
     *
     * @param object element whose box is requested
     * @param textPositioning positioning mode
     * @param pageBbox page bounding box, passed through to {@link #parseBBox}
     * @param unparsedBBoxes collector of nodes whose box could not be parsed
     * @return the (possibly adjusted) bounding box
     */
    static Rectangle getAlignedBBox(Element object, TextPositioning textPositioning, Rectangle pageBbox, Map<String, Node> unparsedBBoxes) {
        Rectangle box = parseBBox(object, pageBbox, unparsedBBoxes);
        boolean byWordsAndLines = TextPositioning.BY_WORDS_AND_LINES == textPositioning;
        if (!byWordsAndLines && TextPositioning.BY_WORDS != textPositioning) {
            return box;
        }
        Rectangle lineBbox = parseBBox(object.parent(), pageBbox, unparsedBBoxes);
        if (byWordsAndLines) {
            box.setBbox(box.getLeft(), lineBbox.getBottom(), box.getRight(), lineBbox.getTop());
        }
        detectAndFixBrokenBBoxes(object, box, lineBbox, pageBbox, unparsedBBoxes);
        return box;
    }

    /**
     * Reads the {@code bbox} coordinates from the node's {@code title} attribute and
     * converts them with {@link #toPoints(float)}.
     *
     * <p>If the attribute cannot be parsed, all coordinates default to {@code 0} and the
     * node is recorded in {@code unparsedBBoxes} under its {@code id} (first occurrence only).
     *
     * <p>When {@code pageBBox} is {@code null} the rectangle is built directly from the
     * converted values {@code (left, top, right, bottom - top)}. Otherwise the vertical
     * coordinates are subtracted from {@code pageBBox.getTop()}.
     *
     * @param node node carrying the {@code title} attribute
     * @param pageBBox page bounding box or {@code null}
     * @param unparsedBBoxes collector of nodes whose box could not be parsed
     * @return the resulting rectangle
     */
    static Rectangle parseBBox(Node node, Rectangle pageBBox, Map<String, Node> unparsedBBoxes) {
        // A primitive array is zero-initialised, which is exactly the documented fallback.
        float[] bbox = new float[BBOX_ARRAY_SIZE];
        boolean parsed = false;
        Matcher bboxMatcher = BBOX_PATTERN.matcher(node.attr(TITLE));
        if (bboxMatcher.matches()) {
            Matcher coordinateMatcher = BBOX_COORDINATE_PATTERN.matcher(bboxMatcher.group());
            if (coordinateMatcher.matches()) {
                for (int i = 0; i < BBOX_ARRAY_SIZE; ++i) {
                    bbox[i] = Float.parseFloat(coordinateMatcher.group(i + 1));
                }
                parsed = true;
            }
        }
        if (!parsed) {
            String id = node.attr("id");
            if (id != null && !unparsedBBoxes.containsKey(id)) {
                unparsedBBoxes.put(id, node);
            }
        }
        float left = toPoints(bbox[LEFT_IDX]);
        float right = toPoints(bbox[RIGHT_IDX]);
        if (pageBBox == null) {
            return new Rectangle(left, toPoints(bbox[TOP_IDX]), right, toPoints(bbox[BOTTOM_IDX] - bbox[TOP_IDX]));
        }
        float pageTop = pageBBox.getTop();
        return new Rectangle(0.0f, 0.0f).setBbox(left, pageTop - toPoints(bbox[TOP_IDX]), right, pageTop - toPoints(bbox[BOTTOM_IDX]));
    }

    /**
     * Repairs the horizontal extent of {@code bbox} when it falls outside {@code lineBbox}.
     *
     * <p>A left edge outside the line is moved to the right edge of the previous sibling
     * element, or to the line's left edge when there is no previous sibling. A right edge
     * outside the line is moved to the left edge of the next sibling element, or to the
     * line's right edge when there is no next sibling. {@code bbox} is modified in place.
     *
     * @param object element owning {@code bbox}
     * @param bbox box to repair
     * @param lineBbox box of the enclosing line
     * @param pageBbox page bounding box, passed through to {@link #parseBBox}
     * @param unparsedBBoxes collector of nodes whose box could not be parsed
     */
    static void detectAndFixBrokenBBoxes(Element object, Rectangle bbox, Rectangle lineBbox, Rectangle pageBbox, Map<String, Node> unparsedBBoxes) {
        boolean leftOutsideLine = bbox.getLeft() < lineBbox.getLeft() || bbox.getLeft() > lineBbox.getRight();
        if (leftOutsideLine) {
            Element previous = object.previousElementSibling();
            if (previous == null) {
                bbox.setX(lineBbox.getLeft());
            } else {
                bbox.setX(parseBBox(previous, pageBbox, unparsedBBoxes).getRight());
            }
        }
        // Evaluated only after the left edge was fixed, since that fix may have changed the box.
        boolean rightOutsideLine = bbox.getRight() > lineBbox.getRight() || bbox.getRight() < lineBbox.getLeft();
        if (rightOutsideLine) {
            Element next = object.nextElementSibling();
            float newRight = next == null
                    ? lineBbox.getRight()
                    : parseBBox(next, pageBbox, unparsedBBoxes).getLeft();
            bbox.setBbox(bbox.getLeft(), bbox.getBottom(), newRight, bbox.getTop());
        }
    }

    /**
     * Converts points to pixels by dividing by {@code 0.75}.
     *
     * @param pt value in points
     * @return value in pixels
     */
    static float toPixels(float pt) {
        return pt / PX_TO_PT;
    }

    /**
     * Converts pixels to points by multiplying by {@code 0.75}.
     *
     * @param px value in pixels
     * @return value in points
     */
    static float toPoints(float px) {
        return px * PX_TO_PT;
    }

    /**
     * Deletes the file at the given path if it exists. A {@code null} or empty path is
     * ignored; I/O and security failures are logged at info level and not propagated.
     *
     * @param pathToFile path of the file to delete
     */
    static void deleteFile(String pathToFile) {
        if (pathToFile == null || pathToFile.isEmpty()) {
            return;
        }
        try {
            if (Files.exists(Paths.get(pathToFile))) {
                Files.delete(Paths.get(pathToFile));
            }
        } catch (IOException | SecurityException e) {
            LOGGER.info(MessageFormatUtil.format("File {0} cannot be deleted: {1}", pathToFile, e.getMessage()));
        }
    }

    /**
     * Reads the whole file as UTF-8 text.
     *
     * @param txtFile file to read
     * @return the file content, or {@code null} if reading failed (the failure is logged)
     */
    static String readTxtFile(File txtFile) {
        try {
            return new String(Files.readAllBytes(txtFile.toPath()), StandardCharsets.UTF_8);
        } catch (IOException e) {
            LOGGER.error(MessageFormatUtil.format("Cannot read file {0}: {1}", txtFile.getAbsolutePath(), e.getMessage()));
            return null;
        }
    }

    /**
     * Writes {@code data} to the file at {@code path} using UTF-8.
     *
     * @param path destination file path
     * @param data text to write
     * @throws PdfOcrInputTesseract4Exception if the file cannot be written; the message
     *         names the path and the underlying cause is preserved
     */
    static void writeToTextFile(String path, String data) {
        try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(path), StandardCharsets.UTF_8)) {
            writer.write(data);
        } catch (IOException e) {
            // Fill in the template; previously the raw "{0}: {1}" placeholders reached the user.
            throw new PdfOcrInputTesseract4Exception(
                    MessageFormatUtil.format("Cannot write to file {0}: {1}", path, e.getMessage()), e);
        }
    }

    /**
     * Runs the command without an explicit working directory.
     *
     * @param execPath path of the executable
     * @param paramsList command parameters
     * @throws PdfOcrTesseract4Exception if the command fails
     * @see #runCommand(String, List, String)
     */
    static void runCommand(String execPath, List<String> paramsList) throws PdfOcrTesseract4Exception {
        runCommand(execPath, paramsList, null);
    }

    /**
     * Runs the command via {@code SystemUtil.runProcessAndWait}, passing the parameters
     * joined by single spaces.
     *
     * @param execPath path of the executable
     * @param paramsList command parameters
     * @param workingDirPath working directory passed to the process runner; may be {@code null}
     * @throws PdfOcrTesseract4Exception if the process reports failure or cannot be run
     */
    static void runCommand(String execPath, List<String> paramsList, String workingDirPath) throws PdfOcrTesseract4Exception {
        try {
            String params = String.join(" ", paramsList);
            boolean cmdSucceeded = SystemUtil.runProcessAndWait(execPath, params, workingDirPath);
            if (!cmdSucceeded) {
                LOGGER.error(MessageFormatUtil.format("Command failed: {0}", execPath + " " + params));
                throw new PdfOcrTesseract4Exception("Tesseract failed. Please check provided parameters");
            }
        } catch (PdfOcrTesseract4Exception e) {
            // Already logged above; do not log and wrap our own failure a second time.
            throw e;
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                // Preserve the interrupt status for callers further up the stack.
                Thread.currentThread().interrupt();
            }
            LOGGER.error(MessageFormatUtil.format("Command failed: {0}", e.getMessage()));
            throw new PdfOcrTesseract4Exception("Tesseract failed. Please check provided parameters");
        }
    }

    /**
     * Collects the text items of one page: first all {@code ocr_line} elements, then all
     * {@code ocr_caption} elements.
     */
    private static List<TextInfo> getTextData(Element page, Tesseract4OcrEngineProperties tesseract4OcrEngineProperties, List<String> txt, Map<String, Node> unparsedBBoxes) {
        Rectangle pageBbox = parseBBox(page, null, unparsedBBoxes);
        Elements objects = new Elements();
        for (String searchedClass : Arrays.asList(OCR_LINE, OCR_CAPTION)) {
            objects.addAll(page.getElementsByClass(searchedClass));
        }
        return getTextData(objects, tesseract4OcrEngineProperties, txt, pageBbox, unparsedBBoxes);
    }

    /**
     * Converts line/caption elements into text items, skipping elements that are empty or
     * whose confidence is below the configured minimum. Word-based positioning modes
     * produce items per word, any other mode one item per line.
     */
    private static List<TextInfo> getTextData(List<Element> pageObjects, Tesseract4OcrEngineProperties tesseract4OcrEngineProperties, List<String> txt, Rectangle pageBbox, Map<String, Node> unparsedBBoxes) {
        List<TextInfo> textData = new ArrayList<>();
        for (Element lineOrCaption : pageObjects) {
            if (lineOrCaption.text().isEmpty()
                    || !isElementConfident(lineOrCaption, tesseract4OcrEngineProperties.getMinimalConfidenceLevel())) {
                continue;
            }
            String hocrLineInTxt = findHocrLineInTxt(lineOrCaption, txt);
            TextPositioning positioning = tesseract4OcrEngineProperties.getTextPositioning();
            boolean wordBased = positioning == TextPositioning.BY_WORDS
                    || positioning == TextPositioning.BY_WORDS_AND_LINES;
            if (wordBased) {
                textData.addAll(getTextDataForWords(lineOrCaption, hocrLineInTxt, positioning, pageBbox, unparsedBBoxes));
            } else {
                textData.addAll(getTextDataForLines(lineOrCaption, hocrLineInTxt, pageBbox, unparsedBBoxes));
            }
        }
        return textData;
    }

    /**
     * Tells whether the average {@code x_wconf} value of the element's direct child
     * elements reaches {@code minimalConfidenceLevel}. Returns {@code true} when the
     * level is {@code 0} or when no child carries an {@code x_wconf} value.
     */
    private static boolean isElementConfident(Element lineOrCaption, int minimalConfidenceLevel) {
        if (minimalConfidenceLevel == 0) {
            return true;
        }
        int wconfTotal = 0;
        int wconfCount = 0;
        for (Node node : lineOrCaption.childNodes()) {
            if (!(node instanceof Element)) {
                continue;
            }
            Matcher matcher = WCONF_PATTERN.matcher(((Element) node).attr(TITLE));
            if (!matcher.matches()) {
                continue;
            }
            // Group 1 is "x_wconf" + optional spaces + digits; strip the keyword to get the number.
            String wconf = matcher.group(1).replaceAll(X_WCONF, "").trim();
            wconfTotal += Integer.parseInt(wconf);
            ++wconfCount;
        }
        if (wconfCount == 0) {
            return true;
        }
        return wconfTotal / wconfCount >= minimalConfidenceLevel;
    }

    /**
     * Builds the text items for the {@code ocrx_word} elements of a line.
     *
     * <p>Without a txt line every word becomes one item. With a txt line, consecutive
     * hOCR words are accumulated until their concatenated text equals the next
     * space-separated item of the txt line, and are then merged into a single item.
     * Words remaining after all txt items have been consumed are ignored.
     */
    private static List<TextInfo> getTextDataForWords(Element lineOrCaption, String txtLine, TextPositioning textPositioning, Rectangle pageBbox, Map<String, Node> unparsedBBoxes) {
        List<TextInfo> textData = new ArrayList<>();
        Elements words = lineOrCaption.getElementsByClass(OCRX_WORD);
        if (txtLine == null) {
            for (Element word : words) {
                Rectangle bboxRect = getAlignedBBox(word, textPositioning, pageBbox, unparsedBBoxes);
                addToTextData(textData, word.text(), bboxRect);
            }
            return textData;
        }
        String[] lineItems = txtLine.replaceAll(NEW_LINE_PATTERN, "").replaceAll(SPACE_PATTERN, " ").split(" ");
        // Cursor into lineItems; replaces re-copying the array after every match.
        int nextItem = 0;
        List<TextInfo> pending = new ArrayList<>();
        for (Element word : words) {
            if (nextItem >= lineItems.length) {
                // Every txt item is matched; indexing further would be out of bounds.
                break;
            }
            Rectangle bboxRect = getAlignedBBox(word, textPositioning, pageBbox, unparsedBBoxes);
            pending.add(new TextInfo(word.text(), bboxRect));
            String expected = lineItems[nextItem].replaceAll(NEW_LINE_OR_SPACE_PATTERN, "");
            String collected = getTextInfosText(pending).replaceAll(SPACE_PATTERN, "");
            if (expected.equals(collected)) {
                ++nextItem;
                addToTextData(textData, mergeTextInfos(pending));
                pending.clear();
            }
        }
        return textData;
    }

    /**
     * Builds a single text item for the whole line, using {@code txtLine} as its text
     * when available and the element's own text otherwise.
     */
    private static List<TextInfo> getTextDataForLines(Element lineOrCaption, String txtLine, Rectangle pageBbox, Map<String, Node> unparsedBBoxes) {
        List<TextInfo> textData = new ArrayList<>();
        Rectangle bboxRect = getAlignedBBox(lineOrCaption, TextPositioning.BY_LINES, pageBbox, unparsedBBoxes);
        String text = txtLine != null ? txtLine : lineOrCaption.text();
        addToTextData(textData, text, bboxRect);
        return textData;
    }

    /** Appends a new {@link TextInfo} built from the given text and box. */
    private static void addToTextData(List<TextInfo> textData, String text, Rectangle bboxRect) {
        textData.add(new TextInfo(text, bboxRect));
    }

    /** Appends a new {@link TextInfo} carrying the text and box of {@code textInfo}. */
    private static void addToTextData(List<TextInfo> textData, TextInfo textInfo) {
        addToTextData(textData, textInfo.getText(), textInfo.getBboxRect());
    }

    /** Concatenates the texts of all items, without separators. */
    private static String getTextInfosText(List<TextInfo> textInfos) {
        StringBuilder joined = new StringBuilder();
        for (TextInfo item : textInfos) {
            joined.append(item.getText());
        }
        return joined.toString();
    }

    /**
     * Merges the items into one: texts are concatenated and the box spans from the left
     * edge of the accumulated box to the right edge of the latest item, covering the
     * lowest bottom and the highest top seen. The list must not be empty.
     */
    private static TextInfo mergeTextInfos(List<TextInfo> textInfos) {
        TextInfo merged = new TextInfo(textInfos.get(0));
        for (TextInfo current : textInfos.subList(1, textInfos.size())) {
            merged.setText(merged.getText() + current.getText());
            Rectangle leftBBox = merged.getBboxRect();
            Rectangle rightBBox = current.getBboxRect();
            float bottom = Math.min(leftBBox.getBottom(), rightBBox.getBottom());
            float top = Math.max(leftBBox.getTop(), rightBBox.getTop());
            merged.setBboxRect(new Rectangle(0.0f, 0.0f).setBbox(leftBBox.getLeft(), bottom, rightBBox.getRight(), top));
        }
        return merged;
    }

    /**
     * Finds the first txt line that equals the hOCR line's text once runs of spaces are
     * removed from both.
     *
     * @return the matching txt line, or {@code null} if {@code txt} is {@code null}, the
     *         hOCR line is empty after removing spaces, or nothing matches
     */
    private static String findHocrLineInTxt(Element line, List<String> txt) {
        if (txt == null) {
            return null;
        }
        String hocrLineText = line.text().replaceAll(SPACE_PATTERN, "");
        if (hocrLineText.isEmpty()) {
            return null;
        }
        for (String txtLine : txt) {
            if (hocrLineText.equals(txtLine.replaceAll(SPACE_PATTERN, ""))) {
                return txtLine;
            }
        }
        return null;
    }
}

