/*
 * Decompiled with CFR 0.152.
 */
package edu.umd.cloud9.collection.clue;

import edu.umd.cloud9.collection.clue.ClueCollectionPathConstants;
import edu.umd.cloud9.collection.clue.ClueWarcDocnoMapping;
import edu.umd.cloud9.collection.clue.ClueWarcInputFormat;
import edu.umd.cloud9.collection.clue.ClueWarcRecord;
import java.io.IOException;
import java.io.PrintStream;
import java.net.URI;
import java.util.Arrays;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

public class CountClueWarcRecords
extends Configured
implements Tool {
    private static final Logger LOG = Logger.getLogger(CountClueWarcRecords.class);
    public static final String ORIGINAL_OPTION = "original";
    public static final String REPACKED_OPTION = "repacked";
    public static final String PATH_OPTION = "path";
    public static final String MAPPING_OPTION = "docnoMapping";
    public static final String SEGMENT_OPTION = "segment";
    public static final String COUNT_OPTION = "countOutput";

    public int run(String[] args) throws Exception {
        boolean repacked;
        CommandLine cmdline;
        Options options = new Options();
        options.addOption(new Option(ORIGINAL_OPTION, "use original ClueWeb09 distribution"));
        options.addOption(new Option(REPACKED_OPTION, "use repacked SequenceFiles"));
        OptionBuilder.withArgName((String)PATH_OPTION);
        OptionBuilder.hasArg();
        OptionBuilder.withDescription((String)"path: base path for 'original', actual path for 'repacked'");
        options.addOption(OptionBuilder.create((String)PATH_OPTION));
        OptionBuilder.withArgName((String)PATH_OPTION);
        OptionBuilder.hasArg();
        OptionBuilder.withDescription((String)"DocnoMapping data path");
        options.addOption(OptionBuilder.create((String)MAPPING_OPTION));
        OptionBuilder.withArgName((String)"num");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription((String)"segment number (required if 'original')");
        options.addOption(OptionBuilder.create((String)SEGMENT_OPTION));
        OptionBuilder.withArgName((String)PATH_OPTION);
        OptionBuilder.hasArg();
        OptionBuilder.withDescription((String)"output file to write the number of records");
        options.addOption(OptionBuilder.create((String)COUNT_OPTION));
        GnuParser parser = new GnuParser();
        try {
            cmdline = parser.parse(options, args);
        }
        catch (ParseException exp) {
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(((Object)((Object)this)).getClass().getName(), options);
            ToolRunner.printGenericCommandUsage((PrintStream)System.out);
            System.err.println("Error parsing command line: " + exp.getMessage());
            return -1;
        }
        if (cmdline.hasOption(REPACKED_OPTION)) {
            repacked = true;
        } else if (cmdline.hasOption(ORIGINAL_OPTION)) {
            repacked = false;
        } else {
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(((Object)((Object)this)).getClass().getName(), options);
            ToolRunner.printGenericCommandUsage((PrintStream)System.out);
            System.err.println("Expecting either -original or -repacked");
            return -1;
        }
        if (!cmdline.hasOption(PATH_OPTION) || !cmdline.hasOption(MAPPING_OPTION) || !repacked && !cmdline.hasOption(SEGMENT_OPTION)) {
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(((Object)((Object)this)).getClass().getName(), options);
            ToolRunner.printGenericCommandUsage((PrintStream)System.out);
            return -1;
        }
        String path = cmdline.getOptionValue(PATH_OPTION);
        String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);
        int segment = 1;
        if (!repacked) {
            segment = Integer.parseInt(cmdline.getOptionValue(SEGMENT_OPTION));
        }
        LOG.info((Object)("Tool name: " + CountClueWarcRecords.class.getSimpleName()));
        LOG.info((Object)(" - repacked: " + repacked));
        LOG.info((Object)(" - path: " + path));
        LOG.info((Object)(" - mapping file: " + mappingFile));
        if (!repacked) {
            LOG.info((Object)(" - segment number: " + segment));
        }
        FileSystem fs = FileSystem.get((Configuration)this.getConf());
        int mapTasks = 10;
        JobConf conf = new JobConf(this.getConf(), CountClueWarcRecords.class);
        conf.setJobName(CountClueWarcRecords.class.getSimpleName() + (repacked ? ":" + path : ":segment" + segment));
        conf.setNumMapTasks(mapTasks);
        conf.setNumReduceTasks(0);
        if (repacked) {
            for (FileStatus status : fs.listStatus(new Path(path))) {
                FileInputFormat.addInputPath((JobConf)conf, (Path)status.getPath());
            }
        } else {
            ClueCollectionPathConstants.addEnglishCollectionPart(conf, path, segment);
        }
        DistributedCache.addCacheFile((URI)new URI(mappingFile), (Configuration)conf);
        if (repacked) {
            conf.setInputFormat(SequenceFileInputFormat.class);
        } else {
            conf.setInputFormat(ClueWarcInputFormat.class);
        }
        conf.setOutputFormat(NullOutputFormat.class);
        conf.setMapperClass(MyMapper.class);
        RunningJob job = JobClient.runJob((JobConf)conf);
        Counters counters = job.getCounters();
        int numDocs = (int)((Counters.Counter)counters.findCounter((Enum)Records.PAGES)).getCounter();
        LOG.info((Object)("Read " + numDocs + " docs."));
        if (cmdline.hasOption(COUNT_OPTION)) {
            String f = cmdline.getOptionValue(COUNT_OPTION);
            FSDataOutputStream out = fs.create(new Path(f));
            out.write(new Integer(numDocs).toString().getBytes());
            out.close();
        }
        return 0;
    }

    public static void main(String[] args) throws Exception {
        LOG.info((Object)("Running " + CountClueWarcRecords.class.getCanonicalName() + " with args " + Arrays.toString(args)));
        ToolRunner.run((Tool)new CountClueWarcRecords(), (String[])args);
    }

    /*
     * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
     */
    private static class MyMapper
    extends MapReduceBase
    implements Mapper<Writable, ClueWarcRecord, Writable, Text> {
        ClueWarcDocnoMapping docMapping = new ClueWarcDocnoMapping();

        private MyMapper() {
        }

        public void configure(JobConf job) {
            try {
                Path[] localFiles = DistributedCache.getLocalCacheFiles((Configuration)job);
                this.docMapping.loadMapping(localFiles[0], (FileSystem)FileSystem.getLocal((Configuration)job));
            }
            catch (Exception e) {
                e.printStackTrace();
                throw new RuntimeException("Error initializing DocnoMapping!");
            }
        }

        public void map(Writable key, ClueWarcRecord doc, OutputCollector<Writable, Text> output, Reporter reporter) throws IOException {
            reporter.incrCounter((Enum)Records.TOTAL, 1L);
            String docid = doc.getHeaderMetadataItem("WARC-TREC-ID");
            int docno = this.docMapping.getDocno(docid);
            if (docid != null && docno != -1) {
                reporter.incrCounter((Enum)Records.PAGES, 1L);
            }
        }
    }

    /*
     * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
     */
    private static enum Records {
        TOTAL,
        PAGES;

    }
}

