/*
 * Decompiled with CFR 0.152.
 */
package edu.umd.cloud9.collection;

import edu.umd.cloud9.collection.Indexable;
import edu.umd.cloud9.collection.line.TextDocument;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.webgraph.TrecExtractLinks;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Hadoop MapReduce tool that extracts, from every document of an input collection, the HTML
 * nodes selected by a target tag and stores them as {@link TextDocument} records in
 * block-compressed SequenceFiles.
 *
 * <p>The tool is driven by four configuration keys (see {@link #RequiredParameters}):
 * {@code Cloud9.InputPath}, {@code Cloud9.InputFormat} (class name of the input format),
 * {@code Cloud9.OutputPath} and {@code Cloud9.TargetTag}. The special target tag
 * {@code heading} (case-insensitive) selects nodes via {@link MyMapper.HeadingTagFilter}
 * instead of a plain tag-name filter.
 */
public class ExtractHTMLFieldCollection extends PowerTool {
    private static final Logger LOG = Logger.getLogger(ExtractHTMLFieldCollection.class);

    /** Configuration keys that must be present for the tool to run. */
    public static final String[] RequiredParameters = new String[]{"Cloud9.InputPath", "Cloud9.InputFormat", "Cloud9.OutputPath", "Cloud9.TargetTag"};

    /**
     * Creates the tool on top of the given configuration.
     *
     * @param conf configuration carrying the required {@code Cloud9.*} parameters
     */
    public ExtractHTMLFieldCollection(Configuration conf) {
        super(conf);
    }

    /**
     * @return the configuration keys this tool needs, i.e. {@link #RequiredParameters}
     */
    @Override
    public String[] getRequiredParameters() {
        return RequiredParameters;
    }

    /**
     * Configures and runs the extraction job, blocking until it finishes.
     *
     * @return {@code 0} if the job completed successfully, {@code 1} otherwise
     * @throws Exception if the input format class cannot be loaded or the job cannot be
     *     set up or submitted
     */
    @Override
    public int runTool() throws Exception {
        Configuration conf = this.getConf();
        Job job = new Job(conf);
        String inputPath = conf.get("Cloud9.InputPath");
        String inputFormat = conf.get("Cloud9.InputFormat");
        String outputPath = conf.get("Cloud9.OutputPath");
        String tag = conf.get("Cloud9.TargetTag");

        job.setJobName("ExtractFieldCollection");
        job.setJarByClass(ExtractHTMLFieldCollection.class);
        job.setMapperClass(MyMapper.class);
        // The stock Reducer is used as-is; no custom reduce logic is defined in this tool.
        job.setReducerClass(Reducer.class);
        job.setNumReduceTasks(200);

        // Class.forName yields Class<?>, which is not assignable to the bounded Class
        // parameter of setInputFormatClass; going through a raw Class lets the compiler
        // apply an unchecked conversion. A wrong class name still fails in forName.
        @SuppressWarnings("rawtypes")
        Class inputFormatClass = Class.forName(inputFormat);
        job.setInputFormatClass(inputFormatClass);
        recursivelyAddInputPaths(job, inputPath);

        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(TextDocument.class);

        LOG.info("ExtractFieldCollection - " + tag);
        LOG.info(" - Input path: " + inputPath);
        LOG.info(" - Input format: " + inputFormat);
        LOG.info(" - Output path: " + outputPath);
        LOG.info(" - Target tag: " + tag);

        // Propagate the job outcome instead of unconditionally reporting success.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Walks the directory tree rooted at {@code path} and registers every file found as an
     * input path of {@code job}. Entries whose name starts with an underscore are skipped,
     * together with everything beneath them.
     *
     * @param job the job whose input paths are extended
     * @param path root directory (or URI) to scan
     * @throws IOException if the file system cannot be reached or the path cannot be listed
     * @throws RuntimeException if {@code path} is not a valid URI (the
     *     {@link URISyntaxException} is attached as the cause)
     */
    public static void recursivelyAddInputPaths(Job job, String path) throws IOException {
        FileSystem fs;
        try {
            fs = FileSystem.get(new URI(path), job.getConfiguration());
        }
        catch (URISyntaxException e) {
            // Keep the original exception as the cause so the offending URI detail is not lost.
            throw new RuntimeException("Error recursively adding path -- " + path, e);
        }
        FileStatus[] entries = fs.listStatus(new Path(path));
        if (entries == null) {
            // Defensive: depending on the Hadoop version listStatus may return null for a
            // missing path; fail with a clear message rather than an NPE in the loop below.
            throw new IOException("Unable to list input path -- " + path);
        }
        for (FileStatus status : entries) {
            if (status.getPath().getName().startsWith("_")) {
                continue;
            }
            if (status.isDir()) {
                recursivelyAddInputPaths(job, status.getPath().toString());
            } else {
                FileInputFormat.addInputPath(job, status.getPath());
            }
        }
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code [input-path] [input-format] [output-path] [target-tag]}
     * @throws Exception if the tool fails to run
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        if (args.length != 4) {
            System.err.println("Usage: ExtractFieldCollection [input-path] [input-format] [output-path] [target-tag]");
            System.exit(-1);
        }
        conf.set("Cloud9.InputPath", args[0]);
        conf.set("Cloud9.InputFormat", args[1]);
        conf.set("Cloud9.OutputPath", args[2]);
        conf.set("Cloud9.TargetTag", args[3]);
        int res = ToolRunner.run(conf, (Tool)new ExtractHTMLFieldCollection(conf), args);
        System.exit(res);
    }

    /**
     * Mapper that parses each document's HTML, keeps the nodes accepted by the configured
     * filter and emits them wrapped in a {@code <DOC>}/{@code <DOCNO>} envelope, keyed by the
     * input key.
     *
     * <p>All working objects are per-instance (not static) so that several mapper instances
     * living in the same JVM cannot interfere with one another.
     */
    public static class MyMapper
    extends Mapper<LongWritable, Indexable, LongWritable, TextDocument> {
        private final Parser parser = new Parser();
        private final LongWritable myKey = new LongWritable();
        private final TextDocument myValue = new TextDocument();
        private final StringBuilder strBuf = new StringBuilder();
        private NodeFilter filter;

        /**
         * Reads {@code Cloud9.TargetTag} from the job configuration and builds the node filter.
         *
         * @param context task context giving access to the job configuration
         * @throws IOException declared for compatibility with the overridden method
         * @throws IllegalArgumentException if {@code Cloud9.TargetTag} is not set
         */
        @Override
        public void setup(Mapper<LongWritable, Indexable, LongWritable, TextDocument>.Context context) throws IOException {
            String tag = context.getConfiguration().get("Cloud9.TargetTag");
            if (tag == null) {
                throw new IllegalArgumentException("Missing required parameter Cloud9.TargetTag");
            }
            filter = tag.equalsIgnoreCase("heading") ? new HeadingTagFilter() : new TagNameFilter(tag);
        }

        /**
         * Extracts the filtered nodes of one document and writes them out.
         *
         * <p>Documents without a docid or without content are counted as input but produce no
         * output. If parsing fails (parser exception or stack overflow while parsing), an
         * envelope containing only the docid is written and the failure counter is incremented.
         *
         * @param key input key, copied unchanged to the output key
         * @param doc the document to process
         * @param context task context used for counters and output
         * @throws IOException if writing the output record fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void map(LongWritable key, Indexable doc, Mapper<LongWritable, Indexable, LongWritable, TextDocument>.Context context) throws IOException, InterruptedException {
            context.getCounter(TrecExtractLinks.Map.LinkCounter.INPUT_DOCS).increment(1L);
            if (doc.getDocid() == null || doc.getContent() == null) {
                return;
            }
            myKey.set(key.get());

            NodeList nodes;
            try {
                parser.setInputHTML(doc.getContent());
                nodes = parser.parse(filter);
            }
            catch (ParserException e) {
                writeParseFailure(doc, context);
                return;
            }
            catch (StackOverflowError e) {
                // Treated exactly like a parser exception: the document is emitted empty.
                writeParseFailure(doc, context);
                return;
            }

            writeDocument(doc, nodes, context);
            context.getCounter(TrecExtractLinks.Map.LinkCounter.OUTPUT_DOCS).increment(1L);
        }

        /** Counts a parse failure and emits an envelope that carries only the docid. */
        private void writeParseFailure(Indexable doc, Mapper<LongWritable, Indexable, LongWritable, TextDocument>.Context context) throws IOException, InterruptedException {
            context.getCounter(TrecExtractLinks.Map.LinkCounter.PARSER_FAILED).increment(1L);
            writeDocument(doc, null, context);
        }

        /**
         * Builds the {@code <DOC>} envelope for {@code doc} and writes it under the current key.
         * The envelope is always properly terminated with {@code </DOC>}; {@code nodes} may be
         * {@code null}, in which case the envelope has no body.
         */
        private void writeDocument(Indexable doc, NodeList nodes, Mapper<LongWritable, Indexable, LongWritable, TextDocument>.Context context) throws IOException, InterruptedException {
            strBuf.setLength(0);
            strBuf.append("<DOC>\n<DOCNO>");
            strBuf.append(doc.getDocid());
            strBuf.append("</DOCNO>\n");
            if (nodes != null) {
                for (int i = 0; i < nodes.size(); ++i) {
                    strBuf.append(nodes.elementAt(i).toHtml()).append("\n");
                }
            }
            strBuf.append("</DOC>\n");
            myValue.setDocid(doc.getDocid());
            myValue.setContent(strBuf.toString());
            context.write(myKey, myValue);
        }

        /**
         * Node filter used for the {@code heading} target tag: accepts a node when its text,
         * taken as a whole, is {@code h1} through {@code h6} (case-insensitive).
         *
         * <p>NOTE(review): the match is against the entire {@code node.getText()} value. If
         * that value includes attributes for tag nodes, headings carrying attributes would be
         * rejected — confirm against the HTML Parser documentation before relying on it.
         */
        public static class HeadingTagFilter
        implements NodeFilter {
            private static final long serialVersionUID = 3848416345122090905L;
            private final Pattern pattern = Pattern.compile("h[123456]", Pattern.CASE_INSENSITIVE);

            /**
             * @param node the node to test
             * @return {@code true} if the node's text fully matches {@code h1}..{@code h6}
             */
            @Override
            public boolean accept(Node node) {
                return this.pattern.matcher(node.getText()).matches();
            }
        }
    }
}

