/*
 * Decompiled with CFR 0.152.
 */
package edu.umd.cloud9.collection.clue;

import edu.umd.cloud9.collection.clue.ClueCollectionPathConstants;
import edu.umd.cloud9.collection.clue.ClueWarcDocnoMapping;
import edu.umd.cloud9.collection.clue.ClueWarcInputFormat;
import edu.umd.cloud9.collection.clue.ClueWarcRecord;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

public class RepackClueWarcRecords
extends Configured
implements Tool {
    private static final Logger LOG = Logger.getLogger(RepackClueWarcRecords.class);

    private static int printUsage() {
        System.out.println("usage: [base-path] [output-path] [segment-num] [docno-mapping-data-file] (block|record|none)");
        ToolRunner.printGenericCommandUsage((PrintStream)System.out);
        return -1;
    }

    public int run(String[] args) throws Exception {
        if (args.length != 5) {
            RepackClueWarcRecords.printUsage();
            return -1;
        }
        String basePath = args[0];
        String outputPath = args[1];
        int segment = Integer.parseInt(args[2]);
        String data = args[3];
        String compressionType = args[4];
        if (!(compressionType.equals("block") || compressionType.equals("record") || compressionType.equals("none"))) {
            System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
            System.exit(-1);
        }
        int blocksize = 1000000;
        JobConf conf = new JobConf(RepackClueWarcRecords.class);
        conf.setJobName("RepackClueWarcRecords:segment" + segment);
        conf.set("DocnoMappingDataFile", data);
        LOG.info((Object)"Tool name: RepackClueWarcRecords");
        LOG.info((Object)(" - base path: " + basePath));
        LOG.info((Object)(" - output path: " + outputPath));
        LOG.info((Object)(" - segment number: " + segment));
        LOG.info((Object)(" - docno mapping data file: " + data));
        LOG.info((Object)(" - compression type: " + compressionType));
        if (compressionType.equals("block")) {
            LOG.info((Object)(" - block size: " + blocksize));
        }
        int mapTasks = 10;
        conf.setNumMapTasks(mapTasks);
        conf.setNumReduceTasks(0);
        ClueCollectionPathConstants.addEnglishCollectionPart(conf, basePath, segment);
        SequenceFileOutputFormat.setOutputPath((JobConf)conf, (Path)new Path(outputPath));
        if (compressionType.equals("none")) {
            SequenceFileOutputFormat.setCompressOutput((JobConf)conf, (boolean)false);
        } else {
            SequenceFileOutputFormat.setCompressOutput((JobConf)conf, (boolean)true);
            if (compressionType.equals("record")) {
                SequenceFileOutputFormat.setOutputCompressionType((JobConf)conf, (SequenceFile.CompressionType)SequenceFile.CompressionType.RECORD);
            } else {
                SequenceFileOutputFormat.setOutputCompressionType((JobConf)conf, (SequenceFile.CompressionType)SequenceFile.CompressionType.BLOCK);
                conf.setInt("io.seqfile.compress.blocksize", blocksize);
            }
        }
        conf.setInputFormat(ClueWarcInputFormat.class);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        conf.setOutputKeyClass(IntWritable.class);
        conf.setOutputValueClass(ClueWarcRecord.class);
        conf.setMapperClass(MyMapper.class);
        FileSystem.get((Configuration)conf).delete(new Path(outputPath), true);
        JobClient.runJob((JobConf)conf);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        LOG.info((Object)("Running " + RepackClueWarcRecords.class.getCanonicalName() + " with args " + Arrays.toString(args)));
        ToolRunner.run((Tool)new RepackClueWarcRecords(), (String[])args);
    }

    private static class MyMapper
    extends MapReduceBase
    implements Mapper<LongWritable, ClueWarcRecord, IntWritable, ClueWarcRecord> {
        private static final IntWritable DOCNO = new IntWritable();
        private ClueWarcDocnoMapping docnoMapping = new ClueWarcDocnoMapping();

        private MyMapper() {
        }

        public void configure(JobConf job) {
            try {
                this.docnoMapping.loadMapping(new Path(job.get("DocnoMappingDataFile")), FileSystem.get((Configuration)job));
            }
            catch (Exception e) {
                throw new RuntimeException("Error loading docno mapping data file!");
            }
        }

        public void map(LongWritable key, ClueWarcRecord doc, OutputCollector<IntWritable, ClueWarcRecord> output, Reporter reporter) throws IOException {
            reporter.incrCounter((Enum)Records.TOTAL, 1L);
            String id = doc.getHeaderMetadataItem("WARC-TREC-ID");
            if (id != null) {
                reporter.incrCounter((Enum)Records.PAGES, 1L);
                DOCNO.set(this.docnoMapping.getDocno(id));
                output.collect((Object)DOCNO, (Object)doc);
            }
        }
    }

    private static enum Records {
        TOTAL,
        PAGES;

    }
}

