/*
 * Decompiled with CFR 0.152.
 */
package org.grobid.trainer;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.List;
import java.util.StringTokenizer;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.grobid.core.GrobidModels;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.mock.MockContext;
import org.grobid.core.utilities.GrobidProperties;
import org.grobid.trainer.AbstractTrainer;
import org.grobid.trainer.sax.TEIReferenceSegmenterSaxParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Trainer for the {@code GrobidModels.REFERENCE_SEGMENTER} model.
 *
 * <p>The training/evaluation data is assembled from a corpus directory that contains a
 * {@code tei} sub-folder (labeled TEI files) and a {@code raw} sub-folder (one feature
 * file per TEI file, named like the TEI file without its {@code .tei.xml} suffix).
 */
public class ReferenceSegmenterTrainer
extends AbstractTrainer {
    public static final Logger LOGGER = LoggerFactory.getLogger(ReferenceSegmenterTrainer.class);

    /**
     * How many labeled entries beyond the current alignment position are inspected when
     * looking for the token of a raw line (positions {@code q} to {@code q + 6} inclusive).
     */
    private static final int ALIGNMENT_LOOKAHEAD = 6;

    /** Selects the TEI files of the corpus by file name suffix. */
    private static final FilenameFilter TEI_FILE_FILTER = new FilenameFilter() {
        @Override
        public boolean accept(File dir, String name) {
            return name.endsWith(".xml") || name.endsWith(".tei");
        }
    };

    public ReferenceSegmenterTrainer() {
        super(GrobidModels.REFERENCE_SEGMENTER);
    }

    /**
     * Writes the whole corpus as training data (no evaluation output, split ratio 1.0).
     *
     * @param corpusPath         corpus directory containing the {@code tei} and {@code raw} folders
     * @param trainingOutputPath file receiving the training data
     * @return the total number of references reported by the TEI parser
     */
    @Override
    public int createCRFPPData(File corpusPath, File trainingOutputPath) {
        return this.createCRFPPData(corpusPath, trainingOutputPath, null, 1.0);
    }

    /**
     * Parses every TEI file of the corpus, aligns its labels with the matching raw feature
     * file and writes the resulting examples to the training and/or evaluation output.
     *
     * <p>When both outputs are given, each document goes to the training output with
     * probability {@code splitRatio} and to the evaluation output otherwise. When only one
     * output is given, every document is written to it exactly once. TEI files without a
     * matching raw file are reported on standard output and skipped.
     *
     * @param corpusDir            corpus directory containing the {@code tei} and {@code raw} folders
     * @param trainingOutputPath   file receiving the training data, or {@code null}
     * @param evaluationOutputPath file receiving the evaluation data, or {@code null}
     * @param splitRatio           probability of assigning a document to the training output
     * @return the sum of the reference counts reported by the TEI parser for all parsed files
     * @throws GrobidException wrapping any failure raised while building the data
     */
    @Override
    public int createCRFPPData(File corpusDir, File trainingOutputPath, File evaluationOutputPath, double splitRatio) {
        int totalExamples = 0;
        Writer trainingWriter = null;
        Writer evaluationWriter = null;
        try {
            LOGGER.info("Corpus directory: {}", corpusDir);
            if (trainingOutputPath != null) {
                LOGGER.info("output path for training data: {}", trainingOutputPath);
            }
            if (evaluationOutputPath != null) {
                LOGGER.info("output path for evaluation data: {}", evaluationOutputPath);
            }

            File teiCorpusDir = new File(corpusDir.getAbsolutePath() + "/tei/");
            if (!teiCorpusDir.exists()) {
                // Report the folder that was actually checked, not its parent.
                throw new IllegalStateException("Folder " + teiCorpusDir.getAbsolutePath() + " does not exist. Please have a look!");
            }
            File[] refFiles = teiCorpusDir.listFiles(TEI_FILE_FILTER);
            if (refFiles == null) {
                throw new IllegalStateException("Folder " + teiCorpusDir.getAbsolutePath() + " does not seem to contain training data. Please check");
            }
            LOGGER.info("Processing {} tei files", refFiles.length);

            if (trainingOutputPath != null) {
                trainingWriter = new OutputStreamWriter((OutputStream) new FileOutputStream(trainingOutputPath), "UTF8");
            }
            if (evaluationOutputPath != null) {
                evaluationWriter = new OutputStreamWriter((OutputStream) new FileOutputStream(evaluationOutputPath), "UTF8");
            }
            System.out.println("training data under: " + trainingOutputPath);
            System.out.println("evaluation data under: " + evaluationOutputPath);

            SAXParserFactory spf = SAXParserFactory.newInstance();
            File rawCorpusDir = new File(corpusDir.getAbsolutePath() + "/raw/");
            for (File teiFile : refFiles) {
                TEIReferenceSegmenterSaxParser saxParser = new TEIReferenceSegmenterSaxParser();
                SAXParser parser = spf.newSAXParser();
                parser.parse(teiFile, (DefaultHandler) saxParser);
                List<String> labeled = saxParser.getLabeledResult();
                // NOTE(review): references are counted even when the document is skipped below
                // because its raw file is missing - confirm this is the intended count.
                totalExamples += saxParser.getTotalReferences();

                if (!rawCorpusDir.exists()) {
                    throw new IllegalStateException("Folder " + rawCorpusDir.getAbsolutePath() + " does not exist. Please have a look!");
                }
                File rawFile = new File(rawCorpusDir.getAbsolutePath() + "/" + teiFile.getName().replace(".tei.xml", ""));
                if (!rawFile.exists()) {
                    System.out.println("Raw file " + rawFile + " does not exist. Please have a look!");
                    continue;
                }

                String example = alignRawFeatures(rawFile, labeled) + "\n \n";
                if (trainingWriter != null && evaluationWriter != null) {
                    // Both outputs requested: random split driven by splitRatio.
                    if (Math.random() <= splitRatio) {
                        trainingWriter.write(example);
                    } else {
                        evaluationWriter.write(example);
                    }
                } else if (trainingWriter != null) {
                    trainingWriter.write(example);
                } else if (evaluationWriter != null) {
                    // Written exactly once (the example used to be duplicated in this case).
                    evaluationWriter.write(example);
                }
            }

            // Close explicitly on the success path so that a failing flush is reported;
            // the finally block below only acts as a safety net.
            if (trainingWriter != null) {
                trainingWriter.close();
            }
            if (evaluationWriter != null) {
                evaluationWriter.close();
            }
        }
        catch (Exception e) {
            throw new GrobidException("An exception occurred while trainining/evaluating reference segmenter model.", (Throwable)e);
        }
        finally {
            closeQuietly(trainingWriter);
            closeQuietly(evaluationWriter);
        }
        return totalExamples;
    }

    /**
     * Reads a raw feature file and appends to each of its lines the tag of the matching
     * labeled entry.
     *
     * <p>The first space-delimited token of a raw line is compared with the first token of
     * the labeled entries, starting at the current alignment position and looking at most
     * {@link #ALIGNMENT_LOOKAHEAD} entries further. On a match the raw line, a space, the tag
     * (second token of the labeled entry) and a newline are emitted and the alignment
     * position moves just past the matched entry. Raw lines without a space, or without a
     * match inside the window, are dropped.
     *
     * @param rawFile raw feature file, read as UTF-8
     * @param labeled labeled entries of the form {@code "token tag"}
     * @return the aligned lines, each terminated by a newline
     * @throws IOException if the raw file cannot be read
     * @throws java.util.NoSuchElementException if a matching labeled entry carries no tag
     */
    private static String alignRawFeatures(File rawFile, List<String> labeled) throws IOException {
        StringBuilder aligned = new StringBuilder();
        BufferedReader reader = new BufferedReader(new InputStreamReader((InputStream) new FileInputStream(rawFile), "UTF8"));
        try {
            int nextLabeled = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                int spaceAt = line.indexOf(' ');
                if (spaceAt == -1) {
                    // No leading token to align on.
                    continue;
                }
                String token = line.substring(0, spaceAt);
                int lastCandidate = Math.min(labeled.size() - 1, nextLabeled + ALIGNMENT_LOOKAHEAD);
                for (int i = nextLabeled; i <= lastCandidate; i++) {
                    StringTokenizer st = new StringTokenizer(labeled.get(i), " ");
                    if (st.hasMoreTokens() && st.nextToken().equals(token)) {
                        String tag = st.nextToken();
                        aligned.append(line).append(" ").append(tag).append("\n");
                        nextLabeled = i + 1;
                        break;
                    }
                }
            }
        }
        finally {
            reader.close();
        }
        return aligned.toString();
    }

    /**
     * Closes the writer if it is not {@code null}, logging instead of propagating a failure
     * so that cleanup never hides the exception that triggered it.
     */
    private static void closeQuietly(Writer writer) {
        if (writer == null) {
            return;
        }
        try {
            writer.close();
        }
        catch (IOException e) {
            LOGGER.warn("Could not close output writer", e);
        }
    }

    /**
     * Runs training followed by evaluation of the reference segmenter model inside a mock
     * initial context, which is destroyed even when training or evaluation fails.
     */
    public static void main(String[] args) throws Exception {
        MockContext.setInitialContext();
        try {
            GrobidProperties.getInstance();
            AbstractTrainer.runTraining(new ReferenceSegmenterTrainer());
            AbstractTrainer.runEvaluation(new ReferenceSegmenterTrainer());
        }
        finally {
            MockContext.destroyInitialContext();
        }
    }
}

