/*
 * Decompiled with CFR 0.152.
 */
package edu.umd.hooka;

import edu.umd.hooka.AlignmentWordPreprocessor;
import edu.umd.hooka.Phrase;
import edu.umd.hooka.PhrasePair;
import edu.umd.hooka.Vocab;
import edu.umd.hooka.VocabServer;
import edu.umd.hooka.VocabServerClient;
import edu.umd.hooka.VocabularyWritable;
import edu.umd.hooka.alignment.aer.ReferenceAlignment;
import edu.umd.hooka.corpora.Chunk;
import edu.umd.hooka.corpora.Language;
import edu.umd.hooka.corpora.LanguagePair;
import edu.umd.hooka.corpora.ParallelChunk;
import edu.umd.hooka.corpora.ParallelCorpusReader;
import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.streaming.StreamXmlRecordReader;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

public class CorpusVocabNormalizerAndNumberizer {
    private static final Logger sLogger = Logger.getLogger(CorpusVocabNormalizerAndNumberizer.class);
    static final String SRC_LANG = "ha.sourcelang";
    static final String TGT_LANG = "ha.targetlang";

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    /**
     * Configures and runs the "bitext.compile" MapReduce job.
     *
     * <p>The job reads {@code <pchunk ...> ... </pchunk>} records through {@link XMLInput},
     * maps them with {@link BitextCompilerMapper}, passes them through an
     * {@link IdentityReducer}, and writes the result as a sequence file of
     * {@code Text} keys and {@code PhrasePair} values. One map task and one reduce
     * task are requested.
     *
     * <p>The mapper reads the {@code ha.sourcelang} / {@code ha.targetlang} keys and the
     * {@code root} key from the job configuration, so callers should set them on
     * {@code c} before calling this method.
     *
     * @param c          base configuration; it is copied into a new {@link JobConf}
     * @param inputPaths input path string handed to
     *                   {@code FileInputFormat.setInputPaths(JobConf, String)}
     * @param output     output directory of the job
     * @throws IOException if submitting or running the job fails
     */
    public static void preprocessAndNumberizeFiles(Configuration c, String inputPaths, Path output) throws IOException {
        sLogger.setLevel(Level.INFO);

        JobConf conf = new JobConf(c);
        conf.setJobName("bitext.compile");

        // Map/reduce wiring.
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(PhrasePair.class);
        conf.setMapperClass(BitextCompilerMapper.class);
        conf.setReducerClass(IdentityReducer.class);
        conf.setNumMapTasks(1);
        conf.setNumReduceTasks(1);

        // Input: XML records delimited by the <pchunk ...> ... </pchunk> markers.
        FileInputFormat.setInputPaths(conf, inputPaths);
        conf.set("stream.recordreader.begin", "<pchunk");
        conf.set("stream.recordreader.end", "</pchunk>");
        conf.set("stream.recordreader.slowmatch", "false");
        conf.set("stream.recordreader.maxrec", "100000");
        conf.setInputFormat(XMLInput.class);

        // Output: a sequence file under the requested directory.
        FileOutputFormat.setOutputPath(conf, output);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        conf.setJarByClass(CorpusVocabNormalizerAndNumberizer.class);

        System.out.println("Running job " + conf.getJobName());
        System.out.println("Input: " + inputPaths);
        System.out.println("Output: " + output);
        JobClient.runJob(conf);
    }

    /**
     * Ad-hoc driver that runs the job for a German ("de") to English ("en") corpus
     * using hard-coded input and output paths.
     *
     * <p>Any exception raised while configuring or running the job is printed to
     * standard error and otherwise ignored; the method then returns normally.
     *
     * @param args command-line arguments; currently not used
     */
    public static void main(String[] args) {
        try {
            Configuration c = new Configuration();
            c.set(SRC_LANG, "de");
            c.set(TGT_LANG, "en");
            CorpusVocabNormalizerAndNumberizer.preprocessAndNumberizeFiles(c, "/umd-lin/fture/mt/eu-nc-wmt2008.de-en.xml", new Path("/umd-lin/fture/mt/aligner/comp-bitext"));
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }

    /*
     * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
     */
    /**
     * Input format that hands each file split to a {@link StreamXmlRecordReader}.
     * Compressed input is not supported: a split whose path maps to a compression
     * codec is rejected with a {@link RuntimeException}.
     *
     * <p>NOTE(review): this class does not declare {@code JobConfigurable}. Confirm that
     * the framework actually invokes {@link #configure(JobConf)}; if it does not,
     * {@code compressionCodecs} stays {@code null}, every file is reported as
     * splittable, and the compression check in {@code getRecordReader} is skipped.
     */
    public static class XMLInput
    extends FileInputFormat<Text, Text> {
        /** Codec lookup table; {@code null} until {@link #configure(JobConf)} has run. */
        private CompressionCodecFactory compressionCodecs = null;

        /**
         * Builds the codec factory from the job configuration.
         *
         * @param conf job configuration used to create the codec factory
         */
        public void configure(JobConf conf) {
            this.compressionCodecs = new CompressionCodecFactory(conf);
        }

        /**
         * Reports whether {@code file} may be split.
         *
         * @return {@code true} when no codec factory has been configured or when no
         *         codec matches {@code file}; {@code false} otherwise
         */
        protected boolean isSplitable(FileSystem fs, Path file) {
            if (this.compressionCodecs == null) {
                return true;
            }
            return this.compressionCodecs.getCodec(file) == null;
        }

        /**
         * Opens the split's file and wraps it in a {@link StreamXmlRecordReader}.
         *
         * @param genericSplit split to read; must be a {@link FileSplit}
         * @param job          job configuration
         * @param reporter     receives the split description as its status
         * @return a record reader over the split
         * @throws IOException      if the file cannot be opened or the reader cannot be created
         * @throws RuntimeException if a compression codec matches the split's path
         */
        public RecordReader<Text, Text> getRecordReader(InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException {
            reporter.setStatus(genericSplit.toString());
            FileSplit split = (FileSplit) genericSplit;
            Path file = split.getPath();

            // Reject compressed input BEFORE opening the file so that no stream is
            // left open when the exception is thrown.
            if (this.compressionCodecs != null && this.compressionCodecs.getCodec(file) != null) {
                throw new RuntimeException("Not handling compression!");
            }

            FileSystem fs = file.getFileSystem(job);
            FSDataInputStream fileIn = fs.open(file);
            boolean handedOff = false;
            try {
                RecordReader<Text, Text> reader = new StreamXmlRecordReader(fileIn, split, reporter, job, FileSystem.get(job));
                handedOff = true;
                return reader;
            }
            finally {
                if (!handedOff) {
                    // Construction failed, so nobody else will close the stream.
                    try {
                        fileIn.close();
                    }
                    catch (IOException ignored) {
                        // Deliberately ignored: the construction failure is the error to report.
                    }
                }
            }
        }
    }

    /*
     * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
     */
    /**
     * Mapper that parses one parallel chunk per record, preprocesses and numberizes
     * the source and target words, and emits a {@link PhrasePair} keyed by the
     * chunk's id string. The two vocabularies built while mapping are written to
     * {@code <root>/vocab.E} and {@code <root>/vocab.F} when the mapper is closed,
     * where {@code root} is read from the job configuration.
     */
    public static class BitextCompilerMapper
    extends MapReduceBase
    implements Mapper<Text, Text, Text, PhrasePair> {
        /** Chunks whose {@code getLength()} exceeds this value on either side are skipped. */
        private static final int MAX_CHUNK_LENGTH = 200;

        // The next four fields are not referenced anywhere inside this class.
        String outputBase = null;
        Path pf = null;
        Path pe = null;
        Path pa = null;
        // Vocabularies are static, so all instances of this class in one JVM share
        // them; configure() only creates them when they are still null.
        /** Vocabulary used for target-side words. */
        static Vocab vocE = null;
        /** Vocabulary used for source-side words. */
        static Vocab vocF = null;
        ParallelCorpusReader pcr = new ParallelCorpusReader();
        Language src = null;
        Language tgt = null;
        /** Preprocessor applied to source-side words. */
        AlignmentWordPreprocessor sawp = null;
        /** Preprocessor applied to target-side words. */
        AlignmentWordPreprocessor tawp = null;
        LanguagePair lp = null;
        /** Job configuration captured in configure(); read again in close(). */
        JobConf job_ = null;
        /** Reusable output key. */
        Text ok = new Text("");

        /**
         * Reads the source/target language codes from the job, makes sure both
         * vocabularies exist, and creates the word preprocessors.
         *
         * <p>When {@code ha.trunc.use} is true (the default) the preprocessors are
         * created for the configured language pair and languages; otherwise they are
         * created with {@code null} pair and language.
         *
         * @param job job configuration
         */
        public void configure(JobConf job) {
            sLogger.setLevel(Level.OFF);
            this.src = Language.languageForISO639_1(job.get(CorpusVocabNormalizerAndNumberizer.SRC_LANG));
            this.tgt = Language.languageForISO639_1(job.get(CorpusVocabNormalizerAndNumberizer.TGT_LANG));
            sLogger.debug("Source language: " + this.src.code());
            sLogger.debug("Target language: " + this.tgt.code());

            if (vocE == null) {
                vocE = new VocabularyWritable();
            }
            if (vocF == null) {
                vocF = new VocabularyWritable();
            }

            this.lp = LanguagePair.languageForISO639_1Pair(this.src.code() + "-" + this.tgt.code());
            if (job.getBoolean("ha.trunc.use", true)) {
                this.sawp = AlignmentWordPreprocessor.CreatePreprocessor(this.lp, this.src, job);
                this.tawp = AlignmentWordPreprocessor.CreatePreprocessor(this.lp, this.tgt, job);
            } else {
                this.sawp = AlignmentWordPreprocessor.CreatePreprocessor(null, null, job);
                this.tawp = AlignmentWordPreprocessor.CreatePreprocessor(null, null, job);
            }
            this.job_ = job;
        }

        /**
         * Maps each string to its id in {@code v}, adding it to the vocabulary via
         * {@code addOrGet}.
         *
         * @param s words to convert
         * @param v vocabulary to look up / extend
         * @return an array of the same length as {@code s} holding the word ids
         */
        public int[] convertStrings(String[] s, Vocab v) {
            int[] res = new int[s.length];
            for (int i = 0; i < s.length; ++i) {
                res[i] = v.addOrGet(s[i]);
                sLogger.info(s[i] + "-->" + res[i]);
            }
            return res;
        }

        /**
         * Writes both vocabularies to {@code <root>/vocab.E} and {@code <root>/vocab.F}
         * on the job's file system.
         *
         * @throws RuntimeException if either vocabulary cannot be written; the
         *                          underlying {@link IOException} is attached as the cause
         */
        public void close() {
            String root = this.job_.get("root", null);
            System.err.println("Target: " + vocE.size() + " types. Writing to " + root + "/vocab.E");
            System.err.println("Source: " + vocF.size() + " types .Writing to " + root + "/vocab.F");
            try {
                FileSystem fs = FileSystem.get(this.job_);
                writeVocab(fs, new Path(root + "/vocab.E"), vocE);
                writeVocab(fs, new Path(root + "/vocab.F"), vocF);
            }
            catch (IOException e) {
                throw new RuntimeException("Vocab couldn't be written to disk.\n" + e.toString(), e);
            }
        }

        /** Serializes one vocabulary to {@code target}, closing the stream even if the write fails. */
        private static void writeVocab(FileSystem fs, Path target, Vocab vocab) throws IOException {
            DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(fs.create(target)));
            try {
                // configure() only ever installs VocabularyWritable instances.
                ((VocabularyWritable) vocab).write(dos);
            }
            finally {
                dos.close();
            }
        }

        /**
         * Parses the record held in {@code key} as a parallel chunk and emits its
         * numberized phrase pair. Chunks that lack the source or target language, or
         * that are longer than the length limit on either side, are counted and
         * skipped. A reference alignment is attached when the chunk provides one.
         *
         * @param key      record text to parse
         * @param value    not used
         * @param oc       receives the chunk id and the phrase pair
         * @param reporter receives the counter updates
         * @throws IOException if the collector fails
         */
        public void map(Text key, Text value, OutputCollector<Text, PhrasePair> oc, Reporter reporter) throws IOException {
            ParallelChunk c = this.pcr.parseString(key.toString());
            this.ok.set(c.idString());
            Chunk fc = c.getChunk(this.src);
            Chunk ec = c.getChunk(this.tgt);
            if (fc == null || ec == null) {
                reporter.incrCounter(BitextCompilerCounters.WRONG_LANGUAGE, 1L);
                return;
            }
            if (fc.getLength() > MAX_CHUNK_LENGTH) {
                reporter.incrCounter(BitextCompilerCounters.SRC_TOO_LONG, 1L);
                return;
            }
            if (ec.getLength() > MAX_CHUNK_LENGTH) {
                reporter.incrCounter(BitextCompilerCounters.TGT_TOO_LONG, 1L);
                return;
            }

            // Target side is numberized first, then source side.
            sLogger.debug("Target sentence:");
            int[] ee = this.convertStrings(this.tawp.preprocessWordsForAlignment(ec.getWords()), vocE);
            sLogger.debug("Source sentence:");
            int[] fe = this.convertStrings(this.sawp.preprocessWordsForAlignment(fc.getWords()), vocF);

            Phrase e = new Phrase(ee, 0);
            Phrase f = new Phrase(fe, 1);
            PhrasePair b = new PhrasePair(f, e);
            ReferenceAlignment ra = c.getReferenceAlignment(this.lp);
            if (ra != null) {
                b.setAlignment(ra);
            }

            reporter.incrCounter(BitextCompilerCounters.EN_WORDS, e.getWords().length);
            reporter.incrCounter(BitextCompilerCounters.FR_WORDS, f.getWords().length);
            reporter.incrCounter(BitextCompilerCounters.CHUNKS, 1L);
            oc.collect(this.ok, b);
        }
    }

    /*
     * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
     */
    /**
     * Job counters updated by {@code BitextCompilerMapper.map}.
     */
    enum BitextCompilerCounters {
        /** Total number of target-side word ids in emitted phrase pairs. */
        EN_WORDS,
        /** Total number of source-side word ids in emitted phrase pairs. */
        FR_WORDS,
        /** Number of phrase pairs emitted. */
        CHUNKS,
        /** Records skipped because the source or target language chunk was missing. */
        WRONG_LANGUAGE,
        /** Records skipped because the source chunk exceeded the length limit. */
        SRC_TOO_LONG,
        /** Records skipped because the target chunk exceeded the length limit. */
        TGT_TOO_LONG
    }
}

