/*
 * Decompiled with CFR 0.152.
 */
package org.grobid.trainer;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.LinkedList;
import java.util.List;
import java.util.StringTokenizer;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.grobid.core.GrobidModels;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.features.FeaturesVectorReference;
import org.grobid.core.mock.MockContext;
import org.grobid.core.sax.MarecSaxParser;
import org.grobid.core.utilities.GrobidProperties;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.TextUtilities;
import org.grobid.trainer.AbstractTrainer;
import org.grobid.trainer.GenericTrainer;
import org.grobid.trainer.TrainerFactory;
import org.grobid.trainer.evaluation.PatentEvaluation;
import org.xml.sax.helpers.DefaultHandler;

public class PatentParserTrainer
extends AbstractTrainer {
    private double epsilon = 1.0E-4;
    private int window = 20;
    private static final int trainWindow = 200;

    public PatentParserTrainer() {
        super(GrobidModels.PATENT_PATENT);
    }

    public int createTrainingData(String trainingDataDir) {
        int nb = 0;
        try {
            String path = new File(new File(PatentParserTrainer.getFilePath2Resources(), "dataset/patent/corpus/").getAbsolutePath()).getAbsolutePath();
            this.createDataSet(null, null, path, trainingDataDir, 0);
        }
        catch (Exception e) {
            throw new GrobidException("An exception occurred while training Grobid.", (Throwable)e);
        }
        return nb;
    }

    @Override
    public int createCRFPPData(File corpusPath, File outputFile) {
        return 0;
    }

    @Override
    public int createCRFPPData(File corpusPath, File outputTrainingFile, File outputEvalFile, double splitRatio) {
        return 0;
    }

    @Override
    public void train() {
        this.createTrainingData(GrobidProperties.getTempPath().getAbsolutePath());
        File trainingDataPath3 = new File(GrobidProperties.getTempPath() + "/all.train");
        File templatePath3 = new File(PatentParserTrainer.getFilePath2Resources(), "dataset/patent/crfpp-templates/text.references.template");
        GenericTrainer trainer = TrainerFactory.getTrainer();
        trainer.setEpsilon(this.epsilon);
        trainer.setWindow(this.window);
        File modelPath3 = new File(GrobidProperties.getModelPath((GrobidModels)GrobidModels.PATENT_ALL).getAbsolutePath() + ".new");
        trainer.train(templatePath3, trainingDataPath3, modelPath3, GrobidProperties.getNBThreads(), this.model);
        this.renameModels(GrobidProperties.getModelPath((GrobidModels)GrobidModels.PATENT_ALL), modelPath3);
    }

    public void createDataSet(String setName, String rank, String corpusPath, String outputPath, int type) {
        int nbFiles = 0;
        int nbNPLRef = 0;
        int nbPatentRef = 0;
        int maxRef = 0;
        try {
            MarecSaxParser sax = new MarecSaxParser();
            sax.patentReferences = true;
            sax.nplReferences = false;
            int srCitations = 0;
            int previousSrCitations = 0;
            int withSR = 0;
            List journalsPositions = null;
            List abbrevJournalsPositions = null;
            List conferencesPositions = null;
            List publishersPositions = null;
            if (type == 0) {
                sax.setN(200);
            } else {
                sax.setN(-1);
            }
            sax = new MarecSaxParser();
            sax.patentReferences = true;
            sax.nplReferences = true;
            if (type == 0) {
                sax.setN(200);
            } else {
                sax.setN(-1);
            }
            SAXParserFactory spf = SAXParserFactory.newInstance();
            spf.setValidating(false);
            spf.setFeature("http://xml.org/sax/features/namespaces", false);
            spf.setFeature("http://xml.org/sax/features/validation", false);
            LinkedList<File> fileList = new LinkedList<File>();
            if (setName == null) {
                fileList.add(new File(corpusPath));
            } else if (rank == null) {
                fileList.add(new File(corpusPath));
            } else {
                fileList.add(new File(corpusPath + "/" + setName + "ing" + rank + "/"));
            }
            OutputStreamWriter writer = null;
            writer = setName == null || setName.length() == 0 ? new OutputStreamWriter((OutputStream)new FileOutputStream(new File(outputPath + "/all.train"), false), "UTF-8") : (rank == null ? new OutputStreamWriter((OutputStream)new FileOutputStream(new File(outputPath + "/all." + setName), false), "UTF-8") : new OutputStreamWriter((OutputStream)new FileOutputStream(new File(outputPath + "/" + setName + "ing" + rank + "/all." + setName), false), "UTF-8"));
            while (fileList.size() > 0) {
                File file = (File)fileList.removeFirst();
                if (file.isDirectory()) {
                    for (File subFile : file.listFiles()) {
                        fileList.addLast(subFile);
                    }
                    continue;
                }
                if (!file.getName().endsWith(".xml")) continue;
                ++nbFiles;
                try {
                    SAXParser p = spf.newSAXParser();
                    FileInputStream in = new FileInputStream(file);
                    sax.setFileName(file.toString());
                    p.parse((InputStream)in, (DefaultHandler)sax);
                    nbNPLRef += sax.getNbNPLRef();
                    nbPatentRef += sax.getNbPatentRef();
                    if (sax.nbAllRef > maxRef) {
                        maxRef = sax.nbAllRef;
                    }
                    if (sax.citations != null && sax.citations.size() > previousSrCitations) {
                        previousSrCitations = sax.citations.size();
                        ++withSR;
                    }
                    journalsPositions = sax.journalsPositions;
                    abbrevJournalsPositions = sax.abbrevJournalsPositions;
                    conferencesPositions = sax.conferencesPositions;
                    publishersPositions = sax.publishersPositions;
                    if (sax.accumulatedText == null) continue;
                    String text = sax.accumulatedText.toString();
                    this.addFeatures(text, writer, journalsPositions, abbrevJournalsPositions, conferencesPositions, publishersPositions);
                    writer.write("\n");
                }
                catch (Exception e) {
                    throw new GrobidException("An exception occured while running Grobid.", (Throwable)e);
                }
            }
            if (sax.citations != null) {
                srCitations += sax.citations.size();
            }
            if (setName != null) {
                System.out.println(setName + "ing on " + nbFiles + " files");
            } else {
                System.out.println("training on " + nbFiles + " files");
            }
            System.out.println("Number of references: " + (nbNPLRef + nbPatentRef));
            System.out.println("Number of patent references: " + nbPatentRef);
            System.out.println("Number of NPL references: " + nbNPLRef);
            System.out.println("Average number of references: " + TextUtilities.formatTwoDecimals((double)((double)(nbNPLRef + nbPatentRef) / (double)nbFiles)));
            System.out.println("Max number of references in file: " + maxRef);
            if (setName == null || setName.length() == 0) {
                System.out.println("common data set under: " + outputPath + "/all.train");
            } else {
                System.out.println("common data set under: " + outputPath + "/all." + setName);
            }
        }
        catch (Exception e) {
            throw new GrobidException("An exception occurred while running Grobid.", (Throwable)e);
        }
    }

    public void addFeatures(String text, Writer writer, List<OffsetPosition> journalPositions, List<OffsetPosition> abbrevJournalPositions, List<OffsetPosition> conferencePositions, List<OffsetPosition> publisherPositions) {
        try {
            StringTokenizer st = new StringTokenizer(text, "\n");
            int totalLine = st.countTokens();
            int posit = 0;
            int currentJournalPositions = 0;
            int currentAbbrevJournalPositions = 0;
            int currentConferencePositions = 0;
            int currentPublisherPositions = 0;
            while (st.hasMoreTokens()) {
                int i;
                boolean isJournalToken = false;
                boolean isAbbrevJournalToken = false;
                boolean isConferenceToken = false;
                boolean isPublisherToken = false;
                boolean skipTest = false;
                String line = st.nextToken();
                if (line.trim().length() == 0) {
                    writer.write("\n");
                    posit = 0;
                    continue;
                }
                if (line.endsWith("\t<ignore>")) {
                    ++posit;
                    continue;
                }
                if (journalPositions != null) {
                    if (currentJournalPositions == journalPositions.size() - 1 && journalPositions.get((int)currentJournalPositions).end < posit) {
                        skipTest = true;
                    }
                    if (!skipTest) {
                        for (i = currentJournalPositions; i < journalPositions.size(); ++i) {
                            if (journalPositions.get((int)i).start <= posit && journalPositions.get((int)i).end >= posit) {
                                isJournalToken = true;
                                currentJournalPositions = i;
                                break;
                            }
                            if (journalPositions.get((int)i).start <= posit) continue;
                            isJournalToken = false;
                            currentJournalPositions = i;
                            break;
                        }
                    }
                }
                skipTest = false;
                if (abbrevJournalPositions != null) {
                    if (currentAbbrevJournalPositions == abbrevJournalPositions.size() - 1 && abbrevJournalPositions.get((int)currentAbbrevJournalPositions).end < posit) {
                        skipTest = true;
                    }
                    if (!skipTest) {
                        for (i = currentAbbrevJournalPositions; i < abbrevJournalPositions.size(); ++i) {
                            if (abbrevJournalPositions.get((int)i).start <= posit && abbrevJournalPositions.get((int)i).end >= posit) {
                                isAbbrevJournalToken = true;
                                currentAbbrevJournalPositions = i;
                                break;
                            }
                            if (abbrevJournalPositions.get((int)i).start <= posit) continue;
                            isAbbrevJournalToken = false;
                            currentAbbrevJournalPositions = i;
                            break;
                        }
                    }
                }
                skipTest = false;
                if (conferencePositions != null) {
                    if (currentConferencePositions == conferencePositions.size() - 1 && conferencePositions.get((int)currentConferencePositions).end < posit) {
                        skipTest = true;
                    }
                    if (!skipTest) {
                        for (i = currentConferencePositions; i < conferencePositions.size(); ++i) {
                            if (conferencePositions.get((int)i).start <= posit && conferencePositions.get((int)i).end >= posit) {
                                isConferenceToken = true;
                                currentConferencePositions = i;
                                break;
                            }
                            if (conferencePositions.get((int)i).start <= posit) continue;
                            isConferenceToken = false;
                            currentConferencePositions = i;
                            break;
                        }
                    }
                }
                skipTest = false;
                if (publisherPositions != null) {
                    if (currentPublisherPositions == publisherPositions.size() - 1 && publisherPositions.get((int)currentPublisherPositions).end < posit) {
                        skipTest = true;
                    }
                    if (!skipTest) {
                        for (i = currentPublisherPositions; i < publisherPositions.size(); ++i) {
                            if (publisherPositions.get((int)i).start <= posit && publisherPositions.get((int)i).end >= posit) {
                                isPublisherToken = true;
                                currentPublisherPositions = i;
                                break;
                            }
                            if (publisherPositions.get((int)i).start <= posit) continue;
                            isPublisherToken = false;
                            currentPublisherPositions = i;
                            break;
                        }
                    }
                }
                FeaturesVectorReference featuresVector = FeaturesVectorReference.addFeaturesPatentReferences((String)line, (int)totalLine, (int)posit, (boolean)isJournalToken, (boolean)isAbbrevJournalToken, (boolean)isConferenceToken, (boolean)isPublisherToken);
                if (featuresVector.label == null) continue;
                writer.write(featuresVector.printVector());
                writer.flush();
                ++posit;
            }
        }
        catch (Exception e) {
            throw new GrobidException("An exception occurred while running Grobid.", (Throwable)e);
        }
    }

    @Override
    public String evaluate() {
        return new PatentEvaluation().evaluate();
    }

    public static void main(String[] args) throws Exception {
        MockContext.setInitialContext();
        GrobidProperties.getInstance();
        AbstractTrainer.runTraining(new PatentParserTrainer());
        MockContext.destroyInitialContext();
    }
}

