/*
 * Decompiled with CFR 0.152.
 */
package org.grobid.trainer;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.List;
import java.util.StringTokenizer;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.grobid.core.GrobidModels;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.mock.MockContext;
import org.grobid.core.utilities.GrobidProperties;
import org.grobid.trainer.AbstractTrainer;
import org.grobid.trainer.sax.TEISegmentationSaxParser;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Trainer for the {@link GrobidModels#SEGMENTATION} model.
 *
 * <p>Builds labelled feature files by pairing TEI-annotated documents (read from a
 * {@code tei} directory) with raw feature files (read from a {@code raw} directory):
 * each raw line whose first space-delimited token matches a labelled token gets the
 * corresponding label appended.
 */
public class SegmentationTrainer
extends AbstractTrainer {
    /** Charset name used for every file this trainer reads or writes. */
    private static final String ENCODING = "UTF8";

    /**
     * How far past the current position the labelled list is scanned for a raw token:
     * scanning stops once the offset from the current position exceeds this value.
     */
    private static final int MAX_LOOKAHEAD = 5;

    // NOTE(review): these two fields are not read anywhere in this class.
    private double epsilon = 1.0E-5;
    private int window = 20;

    public SegmentationTrainer() {
        super(GrobidModels.SEGMENTATION);
    }

    /**
     * Generates a single training file from {@code corpusPath/tei} and {@code corpusPath/raw}.
     *
     * @param corpusPath directory containing the {@code tei} and {@code raw} sub-directories
     * @param outputFile file receiving all generated examples
     * @return the value returned by {@link #addFeaturesSegmentation}
     */
    @Override
    public int createCRFPPData(File corpusPath, File outputFile) {
        return this.addFeaturesSegmentation(corpusPath.getAbsolutePath() + "/tei", corpusPath.getAbsolutePath() + "/raw", outputFile, null, 1.0);
    }

    /**
     * Generates training and evaluation files from {@code corpusDir/tei} and {@code corpusDir/raw}.
     *
     * @param corpusDir          directory containing the {@code tei} and {@code raw} sub-directories
     * @param trainingOutputPath file receiving the training examples (may be {@code null})
     * @param evalOutputPath     file receiving the evaluation examples (may be {@code null})
     * @param splitRatio         probability that a document goes to the training file when both outputs are given
     * @return the value returned by {@link #addFeaturesSegmentation}
     */
    @Override
    public int createCRFPPData(File corpusDir, File trainingOutputPath, File evalOutputPath, double splitRatio) {
        return this.addFeaturesSegmentation(corpusDir.getAbsolutePath() + "/tei", corpusDir.getAbsolutePath() + "/raw", trainingOutputPath, evalOutputPath, splitRatio);
    }

    /**
     * Aligns every {@code .tei.xml} / {@code .tei} file of the TEI directory with its raw
     * feature file and writes the labelled result to the training and/or evaluation output.
     *
     * <p>The raw file name is the TEI file name with {@code ".tei.xml"} removed (a name ending
     * in plain {@code ".tei"} is therefore looked up unchanged). TEI files whose raw file does
     * not exist are reported on standard output and skipped.
     *
     * <p>Each document goes to exactly one output: the only non-null one, or, when both are
     * given, the training output with probability {@code splitRatio} and the evaluation output
     * otherwise. When both outputs are {@code null} nothing is written.
     *
     * @param sourceTEIPathLabel directory holding the TEI files
     * @param sourceRawPathLabel directory holding the raw feature files
     * @param trainingOutputPath training output file, or {@code null}
     * @param evalOutputPath     evaluation output file, or {@code null}
     * @param splitRatio         training share used for the random split
     * @return currently always 0 (see note in the body)
     * @throws GrobidException wrapping any exception raised while parsing, reading or writing
     */
    public int addFeaturesSegmentation(String sourceTEIPathLabel, String sourceRawPathLabel, File trainingOutputPath, File evalOutputPath, double splitRatio) {
        // NOTE(review): this counter is never incremented, so callers always receive 0.
        // Left untouched because callers outside this file may rely on it — confirm intent.
        int totalExamples = 0;
        try {
            System.out.println("sourceTEIPathLabel: " + sourceTEIPathLabel);
            System.out.println("sourceRawPathLabel: " + sourceRawPathLabel);
            System.out.println("trainingOutputPath: " + trainingOutputPath);
            System.out.println("evalOutputPath: " + evalOutputPath);
            File[] refFiles = listTeiFiles(new File(sourceTEIPathLabel), true);
            if (refFiles == null) {
                // Not a readable directory: bail out before creating any output file.
                return 0;
            }
            System.out.println(refFiles.length + " tei files");

            Writer trainingWriter = null;
            Writer evalWriter = null;
            try {
                if (trainingOutputPath != null) {
                    trainingWriter = openWriter(trainingOutputPath);
                }
                if (evalOutputPath != null) {
                    evalWriter = openWriter(evalOutputPath);
                }
                SAXParserFactory spf = SAXParserFactory.newInstance();
                for (File tf : refFiles) {
                    String name = tf.getName();
                    System.out.println(name);
                    TEISegmentationSaxParser teiHandler = new TEISegmentationSaxParser();
                    SAXParser p = spf.newSAXParser();
                    p.parse(tf, (DefaultHandler)teiHandler);
                    List<String> labeled = teiHandler.getLabeledResult();
                    File theRawFile = new File(sourceRawPathLabel + "/" + name.replace(".tei.xml", ""));
                    if (!theRawFile.exists()) {
                        System.out.println("Raw file " + theRawFile + " does not exist. Please have a look!");
                        continue;
                    }
                    String segmentation = alignLabels(theRawFile, labeled, " \t");
                    Writer target = chooseWriter(trainingWriter, evalWriter, splitRatio);
                    if (target != null) {
                        target.write(segmentation + "\n");
                    }
                }
            }
            finally {
                // Runs on success and on failure so no file handle is left open.
                closeBoth(trainingWriter, evalWriter);
            }
        }
        catch (Exception e) {
            throw new GrobidException("An exception occured while running Grobid.", (Throwable)e);
        }
        return totalExamples;
    }

    /**
     * Variant of {@link #addFeaturesSegmentation} with a single output: only {@code .tei.xml}
     * files are considered, labelled lines are tokenized on spaces only, and a missing raw
     * file is not skipped but fails the whole run.
     *
     * @param sourceTEIPathLabel directory holding the TEI files
     * @param sourceRawPathLabel directory holding the raw feature files
     * @param outputPath         file receiving every generated example
     * @return currently always 0 (see note in the body)
     * @throws GrobidException wrapping any exception raised while parsing, reading or writing
     */
    public int addFeaturesSegmentation2(String sourceTEIPathLabel, String sourceRawPathLabel, File outputPath) {
        // NOTE(review): never incremented, so callers always receive 0 — confirm intent.
        int totalExamples = 0;
        try {
            System.out.println("sourceTEIPathLabel: " + sourceTEIPathLabel);
            System.out.println("sourceRawPathLabel: " + sourceRawPathLabel);
            System.out.println("outputPath: " + outputPath);
            File[] refFiles = listTeiFiles(new File(sourceTEIPathLabel), false);
            if (refFiles == null) {
                return 0;
            }
            System.out.println(refFiles.length + " tei files");

            Writer output = openWriter(outputPath);
            try {
                SAXParserFactory spf = SAXParserFactory.newInstance();
                for (File tf : refFiles) {
                    String name = tf.getName();
                    System.out.println(name);
                    TEISegmentationSaxParser teiHandler = new TEISegmentationSaxParser();
                    SAXParser p = spf.newSAXParser();
                    p.parse(tf, (DefaultHandler)teiHandler);
                    List<String> labeled = teiHandler.getLabeledResult();
                    File rawFile = new File(sourceRawPathLabel + "/" + name.replace(".tei.xml", ""));
                    output.write(alignLabels(rawFile, labeled, " ") + "\n");
                }
            }
            finally {
                // Closing the writer also flushes it and closes the underlying stream.
                output.close();
            }
        }
        catch (Exception e) {
            throw new GrobidException("An exception occured while running Grobid.", (Throwable)e);
        }
        return totalExamples;
    }

    /**
     * Lists the TEI files directly inside {@code directory}.
     *
     * @param directory      directory to scan
     * @param acceptPlainTei whether names ending in {@code ".tei"} are accepted in addition to {@code ".tei.xml"}
     * @return the matching files, or {@code null} when {@link File#listFiles(FilenameFilter)} returns {@code null}
     */
    private static File[] listTeiFiles(File directory, final boolean acceptPlainTei) {
        return directory.listFiles(new FilenameFilter(){

            @Override
            public boolean accept(File dir, String name) {
                return name.endsWith(".tei.xml") || (acceptPlainTei && name.endsWith(".tei"));
            }
        });
    }

    /** Opens {@code target} for writing (truncating it) with the trainer's charset. */
    private static Writer openWriter(File target) throws IOException {
        return new OutputStreamWriter(new FileOutputStream(target), ENCODING);
    }

    /**
     * Picks the output for one document: the only available writer, or a random choice
     * weighted by {@code splitRatio} when both exist. Returns {@code null} when neither exists.
     */
    private static Writer chooseWriter(Writer trainingWriter, Writer evalWriter, double splitRatio) {
        if (trainingWriter == null) {
            return evalWriter;
        }
        if (evalWriter == null) {
            return trainingWriter;
        }
        return Math.random() <= splitRatio ? trainingWriter : evalWriter;
    }

    /** Closes both writers (either may be {@code null}); the second is closed even if the first fails. */
    private static void closeBoth(Writer first, Writer second) throws IOException {
        try {
            if (first != null) {
                first.close();
            }
        }
        finally {
            if (second != null) {
                second.close();
            }
        }
    }

    /**
     * Reads {@code rawFile} line by line and, for each line, searches the labelled entries for
     * one whose first token equals the line's text before its first space. On a match the raw
     * line, a space and the entry's second token are appended to the result, and the search
     * position moves just past the matched entry. Lines without a space, or with no match
     * within the look-ahead window, contribute nothing.
     *
     * <p>No separator is appended after the label — presumably each labelled entry already
     * ends with its own line terminator; TODO confirm against {@code TEISegmentationSaxParser}.
     *
     * @param rawFile    raw feature file to read
     * @param labeled    labelled entries, each expected to hold a token followed by a label
     * @param delimiters delimiter characters used to tokenize the labelled entries
     * @return the concatenated labelled lines
     * @throws IOException if the raw file cannot be opened or read
     * @throws java.util.NoSuchElementException if a matching labelled entry has no second token
     */
    private static String alignLabels(File rawFile, List<String> labeled, String delimiters) throws IOException {
        StringBuilder aligned = new StringBuilder();
        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(rawFile), ENCODING));
        try {
            int cursor = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                int spaceIndex = line.indexOf(' ');
                // A line without any space yields a null token, which never matches.
                String token = spaceIndex != -1 ? line.substring(0, spaceIndex) : null;
                for (int pos = cursor; pos < labeled.size(); ++pos) {
                    StringTokenizer st = new StringTokenizer(labeled.get(pos), delimiters);
                    if (st.hasMoreTokens() && st.nextToken().equals(token)) {
                        String tag = st.nextToken();
                        aligned.append(line).append(" ").append(tag);
                        cursor = pos + 1;
                        break;
                    }
                    if (pos - cursor > MAX_LOOKAHEAD) {
                        // Too far from the last match: give up on this raw line.
                        break;
                    }
                }
            }
        }
        finally {
            reader.close();
        }
        return aligned.toString();
    }

    /**
     * Command-line entry point: sets up the mock context, loads the Grobid properties, then
     * runs training followed by evaluation. The mock context is destroyed even when one of
     * these steps throws.
     */
    public static void main(String[] args) throws Exception {
        MockContext.setInitialContext();
        try {
            GrobidProperties.getInstance();
            AbstractTrainer.runTraining(new SegmentationTrainer());
            AbstractTrainer.runEvaluation(new SegmentationTrainer());
        }
        finally {
            MockContext.destroyInitialContext();
        }
    }
}

