/*
 * Decompiled with CFR 0.152.
 */
package edu.umd.hooka;

import edu.umd.hooka.Alignment;
import edu.umd.hooka.Metadata;
import edu.umd.hooka.Phrase;
import edu.umd.hooka.PhrasePair;
import edu.umd.hooka.VocabularyWritable;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;

/**
 * Hadoop driver that compiles a line-aligned parallel text (plus an optional
 * word-alignment file) into a {@link SequenceFile} of {@link PhrasePair}
 * records, two vocabulary files and a metadata file.
 *
 * <p>The job is configured with a single map task and no reducers; the input
 * path {@code "dummy"} only serves to trigger that map task. The mapper reads
 * the real corpus files named in the job configuration.
 */
public class HBitextCompiler {
    /** Configuration key: base name of all output files. */
    static final String OUTPUT_BASENAME = "bitextcomp.outputbasename";
    /** Configuration key: path of the "e" side text file. */
    static final String EN_PATH = "bitextcomp.enpath";
    /** Configuration key: path of the "f" side text file. */
    static final String FR_PATH = "bitextcomp.frpath";
    /** Configuration key: path of the alignment file; empty or unset means no alignments. */
    static final String AL_PATH = "bitextcomp.alpath";

    private static final String DEFAULT_OUTPUT_BASENAME = "/shared/bitexts/ep700k+nc.de-en/ep700k+nc";
    private static final String DEFAULT_FR_PATH = "filt.lc.de";
    private static final String DEFAULT_EN_PATH = "filt.lc.en";
    private static final String DEFAULT_AL_PATH = "";

    private static final String DUMMY_INPUT = "dummy";
    private static final String DUMMY_OUTPUT = "dummy.out";

    /**
     * Configures and runs the compilation job.
     *
     * <p>All arguments are optional and positional; a missing argument falls
     * back to the built-in default, so running without arguments behaves as
     * before:
     * <ol>
     *   <li>output base name</li>
     *   <li>path of the "f" side text</li>
     *   <li>path of the "e" side text</li>
     *   <li>path of the alignment file (empty string for none)</li>
     * </ol>
     *
     * <p>If the job cannot be set up or fails with an {@link IOException}, the
     * error is printed and the JVM exits with status 1.
     *
     * @param args optional positional overrides as described above
     */
    public static void main(String[] args) {
        JobConf conf = new JobConf(HBitextCompiler.class);
        conf.set(OUTPUT_BASENAME, argOrDefault(args, 0, DEFAULT_OUTPUT_BASENAME));
        conf.set(FR_PATH, argOrDefault(args, 1, DEFAULT_FR_PATH));
        conf.set(EN_PATH, argOrDefault(args, 2, DEFAULT_EN_PATH));
        conf.set(AL_PATH, argOrDefault(args, 3, DEFAULT_AL_PATH));
        conf.setJobName("bitext.compile");
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);
        conf.setMapperClass(BitextCompilerMapper.class);
        conf.setNumMapTasks(1);
        conf.setNumReduceTasks(0);
        FileInputFormat.setInputPaths(conf, new Path(DUMMY_INPUT));
        try {
            Path dummyOut = new Path(DUMMY_OUTPUT);
            // Recursive delete: the job output location is a directory.
            FileSystem.get(conf).delete(dummyOut, true);
            FileOutputFormat.setOutputPath(conf, dummyOut);
            conf.setOutputFormat(SequenceFileOutputFormat.class);
            JobClient.runJob(conf);
        }
        catch (IOException e) {
            System.err.println("Caught " + e);
            e.printStackTrace();
            // Report the failure to the caller instead of exiting with status 0.
            System.exit(1);
        }
    }

    /** Returns {@code args[index]} when present and non-null, otherwise {@code fallback}. */
    private static String argOrDefault(String[] args, int index, String fallback) {
        if (args != null && index < args.length && args[index] != null) {
            return args[index];
        }
        return fallback;
    }

    /**
     * Mapper that ignores its input record and performs the whole compilation
     * in a single {@link #map} call, reading the corpus files named in the job
     * configuration and writing all outputs next to {@code outputBase}.
     */
    public static class BitextCompilerMapper
    extends MapReduceBase
    implements Mapper<LongWritable, Text, LongWritable, Text> {
        private static final String CHARSET = "UTF8";

        String outputBase = null;
        Path pf = null;
        Path pe = null;
        Path pa = null;
        /** Job configuration captured in {@link #configure}; null if configure was never called. */
        private JobConf job = null;

        /**
         * Reads the output base name and the corpus paths from the job
         * configuration. The alignment path stays {@code null} when the
         * corresponding setting is missing or empty.
         *
         * @param job the job configuration
         */
        @Override
        public void configure(JobConf job) {
            this.job = job;
            this.outputBase = job.get(HBitextCompiler.OUTPUT_BASENAME);
            this.pe = new Path(job.get(HBitextCompiler.EN_PATH));
            this.pf = new Path(job.get(HBitextCompiler.FR_PATH));
            String alps = job.get(HBitextCompiler.AL_PATH);
            if (alps != null && alps.length() > 0) {
                this.pa = new Path(alps);
            }
        }

        /**
         * Compiles the bitext and writes, in order: the phrase-pair sequence
         * file at {@code outputBase}, the vocabularies at
         * {@code outputBase + ".voc.e"} and {@code outputBase + ".voc.f"}, and
         * the metadata at {@code outputBase + ".metadata"}. Finally emits a
         * single {@code (0, "done")} record.
         *
         * @param key      ignored
         * @param value    ignored
         * @param oc       receives the single completion record
         * @param reporter receives progress notifications and counter updates
         * @throws IOException      if reading the corpus or writing any output fails
         * @throws RuntimeException if the two text files differ in line count
         */
        @Override
        public void map(LongWritable key, Text value, OutputCollector<LongWritable, Text> oc, Reporter reporter) throws IOException {
            // Prefer the job's own configuration so job-level settings are honoured.
            Configuration conf = this.job != null ? this.job : new Configuration();
            FileSystem fileSys = FileSystem.get(conf);
            VocabularyWritable vocE = new VocabularyWritable();
            VocabularyWritable vocF = new VocabularyWritable();

            int lineCount = writePhrasePairs(fileSys, conf, vocE, vocF, reporter);
            writeVocabulary(fileSys, new Path(this.outputBase + ".voc.e"), vocE);
            writeVocabulary(fileSys, new Path(this.outputBase + ".voc.f"), vocF);
            writeMetadata(fileSys, new Path(this.outputBase + ".metadata"), new Metadata(lineCount, vocE.size(), vocF.size()));

            oc.collect(new LongWritable(0L), new Text("done"));
        }

        /**
         * Reads the text files (and the alignment file, if configured) line by
         * line and appends one phrase pair per line, keyed by its 1-based line
         * number, to the sequence file at {@code outputBase}.
         *
         * @return the number of lines read from the "e" side file, including
         *         lines that were skipped because of an error
         */
        private int writePhrasePairs(FileSystem fileSys, Configuration conf, VocabularyWritable vocE, VocabularyWritable vocF, Reporter reporter) throws IOException {
            boolean hasAlignment = this.pa != null;
            BufferedReader rde = null;
            BufferedReader rdf = null;
            BufferedReader rda = null;
            SequenceFile.Writer sfw = SequenceFile.createWriter(fileSys, conf, new Path(this.outputBase), IntWritable.class, PhrasePair.class);
            try {
                rde = openReader(fileSys, this.pe);
                rdf = openReader(fileSys, this.pf);
                if (hasAlignment) {
                    rda = openReader(fileSys, this.pa);
                }
                IntWritable lci = new IntWritable(0);
                int lc = 0;
                // Touch the counter so it is reported even when no error occurs.
                reporter.incrCounter(BitextCompilerCounters.ENCODING_ERRORS, 0L);
                String es;
                while ((es = rde.readLine()) != null) {
                    if (++lc % 100 == 0) {
                        reporter.progress();
                    }
                    reporter.incrCounter(BitextCompilerCounters.LINES, 1L);
                    String fs = rdf.readLine();
                    if (fs == null) {
                        throw new RuntimeException(this.pf + " has fewer lines than " + this.pe);
                    }
                    // Consume the alignment line before parsing, so that a sentence
                    // that fails to parse does not shift all following alignments.
                    // NOTE(review): a null here (alignment file shorter than the text)
                    // is still handed to Alignment as before - confirm desired handling.
                    String als = hasAlignment ? rda.readLine() : null;

                    Phrase en;
                    Phrase fr;
                    PhrasePair pair;
                    try {
                        en = Phrase.fromString(0, es, vocE);
                        fr = Phrase.fromString(1, fs, vocF);
                        pair = new PhrasePair(fr, en);
                        if (hasAlignment) {
                            pair.setAlignment(new Alignment(fr.size(), en.size(), als));
                        }
                    }
                    catch (Exception ex) {
                        // Only problems building the pair are counted and skipped.
                        System.err.println("\nAt line " + lc + " caught: " + ex);
                        reporter.incrCounter(BitextCompilerCounters.ENCODING_ERRORS, 1L);
                        continue;
                    }

                    // Outside the try: an I/O failure while writing must abort the task
                    // instead of being counted as an encoding error.
                    lci.set(lc);
                    sfw.append(lci, pair);
                    reporter.incrCounter(BitextCompilerCounters.EN_WORDS, (long)en.getWords().length);
                    reporter.incrCounter(BitextCompilerCounters.FR_WORDS, (long)fr.getWords().length);
                    reporter.progress();
                }
                if (rdf.readLine() != null) {
                    throw new RuntimeException(this.pf + " has more lines than " + this.pe);
                }
                return lc;
            }
            finally {
                closeQuietly(rda);
                closeQuietly(rdf);
                closeQuietly(rde);
                sfw.close();
            }
        }

        /** Opens {@code path} on {@code fileSys} as a buffered UTF-8 character reader. */
        private static BufferedReader openReader(FileSystem fileSys, Path path) throws IOException {
            return new BufferedReader(new InputStreamReader(fileSys.open(path), CHARSET));
        }

        /** Closes a reader if it was opened; failures are ignored because the data has already been read. */
        private static void closeQuietly(BufferedReader reader) {
            if (reader == null) {
                return;
            }
            try {
                reader.close();
            }
            catch (IOException ignored) {
                // Nothing useful can be done about a failed close on a read-only stream.
            }
        }

        /** Writes {@code voc} to a newly created file at {@code path}. */
        private static void writeVocabulary(FileSystem fileSys, Path path, VocabularyWritable voc) throws IOException {
            DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(fileSys.create(path)));
            try {
                voc.write(dos);
            }
            finally {
                dos.close();
            }
        }

        /** Serializes {@code metadata} with Java object serialization to a newly created file at {@code path}. */
        private static void writeMetadata(FileSystem fileSys, Path path, Metadata metadata) throws IOException {
            ObjectOutputStream mdstream = new ObjectOutputStream(new BufferedOutputStream(fileSys.create(path)));
            try {
                mdstream.writeObject(metadata);
            }
            finally {
                mdstream.close();
            }
        }
    }

    /** Counters reported by {@link BitextCompilerMapper}. */
    enum BitextCompilerCounters {
        /** Sum of word counts of successfully written "e" side phrases. */
        EN_WORDS,
        /** Sum of word counts of successfully written "f" side phrases. */
        FR_WORDS,
        /** Number of lines read from the "e" side file. */
        LINES,
        /** Number of lines skipped because building the phrase pair threw an exception. */
        ENCODING_ERRORS;

    }
}

