/*
 * Decompiled with CFR 0.152.
 */
package edu.umd.cloud9.webgraph.driver;

import edu.umd.cloud9.collection.DocnoMapping;
import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.webgraph.DriverUtil;
import edu.umd.cloud9.webgraph.data.AnchorText;
import edu.umd.cloud9.webgraph.data.IndexableAnchorText;
import java.io.IOException;
import java.io.PrintStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

public class BuildIndexableAnchorCollection
extends Configured
implements Tool {
    private static final Logger LOG = Logger.getLogger(BuildIndexableAnchorCollection.class);

    private static int printUsage() {
        System.out.println("usage: [-input collection-path] [-output output-path] [-docnoClass docno-mapping-class] [-docno docno-mapping-file] [-numReducers num-reducers] [optional:-maxLength maximum content length]");
        ToolRunner.printGenericCommandUsage((PrintStream)System.out);
        return -1;
    }

    public int run(String[] args) throws Exception {
        if (args.length < 5) {
            BuildIndexableAnchorCollection.printUsage();
            return -1;
        }
        JobConf conf = new JobConf(this.getConf());
        FileSystem fs = FileSystem.get((Configuration)conf);
        String collectionPath = DriverUtil.argValue(args, "-input");
        String outputPath = DriverUtil.argValue(args, "-output");
        String docnoMappingClass = DriverUtil.argValue(args, "-docnoClass");
        String docnoMapping = DriverUtil.argValue(args, "-docno");
        int numReducers = Integer.parseInt(DriverUtil.argValue(args, "-numReducers"));
        if (DriverUtil.argExists(args, "-maxLength")) {
            conf.setInt("Cloud9.maxContentLength", Integer.parseInt(DriverUtil.argValue(args, "-maxLength")));
        }
        conf.set("Cloud9.DocnoMappingClass", docnoMappingClass);
        LOG.info((Object)"Tool name: BuildAnchorTextForwardIndex");
        LOG.info((Object)(" - collection path: " + collectionPath));
        LOG.info((Object)(" - output path: " + outputPath));
        LOG.info((Object)(" - docno-mapping class: " + docnoMappingClass));
        LOG.info((Object)(" - docno-mapping file: " + docnoMapping));
        if (args.length == 6) {
            LOG.info((Object)(" - maximum content length: " + conf.getInt("Cloud9.maxContentLength", 0)));
        }
        conf.set("mapred.child.java.opts", "-Xmx2048m");
        conf.setJobName("BuildIndexableAnchorCollection");
        conf.setJarByClass(BuildIndexableAnchorCollection.class);
        conf.setNumMapTasks(100);
        conf.setNumReduceTasks(numReducers);
        DistributedCache.addCacheFile((URI)new URI(docnoMapping), (Configuration)conf);
        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setCompressOutput((JobConf)conf, (boolean)true);
        SequenceFileOutputFormat.setOutputCompressionType((JobConf)conf, (SequenceFile.CompressionType)SequenceFile.CompressionType.BLOCK);
        SequenceFileInputFormat.setInputPaths((JobConf)conf, (Path[])new Path[]{new Path(collectionPath)});
        SequenceFileOutputFormat.setOutputPath((JobConf)conf, (Path)new Path(outputPath));
        conf.setOutputKeyClass(IntWritable.class);
        conf.setOutputValueClass(IndexableAnchorText.class);
        conf.setMapperClass(MyMapper.class);
        conf.setReducerClass(IdentityReducer.class);
        fs.delete(new Path(outputPath), true);
        RunningJob job = JobClient.runJob((JobConf)conf);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run((Configuration)new Configuration(), (Tool)new BuildIndexableAnchorCollection(), (String[])args);
        System.exit(res);
    }

    public static class MyMapper
    extends MapReduceBase
    implements Mapper<IntWritable, ArrayListWritable<AnchorText>, IntWritable, IndexableAnchorText> {
        private static final IndexableAnchorText sOutputValue = new IndexableAnchorText();
        private static DocnoMapping docnoMapping;
        private static int maxContentLength;

        public void configure(JobConf job) {
            Path[] localFiles;
            maxContentLength = job.getInt("Cloud9.maxContentLength", 0);
            String docnoMappingClass = job.get("Cloud9.DocnoMappingClass", "edu.umd.cloud9.collection.clue.ClueWarcDocnoMapping");
            try {
                docnoMapping = (DocnoMapping)Class.forName(docnoMappingClass).newInstance();
            }
            catch (Exception e) {
                throw new RuntimeException("Class " + docnoMappingClass + " not found!");
            }
            try {
                localFiles = DistributedCache.getLocalCacheFiles((Configuration)job);
            }
            catch (IOException e) {
                throw new RuntimeException("Local cache files not read properly.");
            }
            try {
                docnoMapping.loadMapping(localFiles[0], (FileSystem)FileSystem.getLocal((Configuration)job));
            }
            catch (Exception e) {
                e.printStackTrace();
                throw new RuntimeException("Error initializing DocnoMapping!");
            }
        }

        public void map(IntWritable key, ArrayListWritable<AnchorText> value, OutputCollector<IntWritable, IndexableAnchorText> output, Reporter reporter) throws IOException {
            sOutputValue.clear();
            sOutputValue.setDocid(docnoMapping.getDocid(key.get()));
            if (maxContentLength > 0) {
                sOutputValue.concatenateAnchors(value, maxContentLength);
            } else {
                sOutputValue.concatenateAnchors(value);
            }
            output.collect((Object)key, (Object)sOutputValue);
        }
    }
}

