/*
 * Decompiled with CFR 0.152.
 */
package edu.umd.cloud9.webgraph;

import edu.umd.cloud9.collection.clue.ClueWarcDocnoMapping;
import edu.umd.cloud9.collection.clue.ClueWarcRecord;
import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.webgraph.data.AnchorText;
import edu.umd.cloud9.webgraph.data.AnchorTextConstants;
import edu.umd.cloud9.webgraph.normalizer.AnchorTextNormalizer;
import java.io.IOException;
import java.io.UTFDataFormatException;
import java.net.URI;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.log4j.Logger;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class ClueExtractLinks
extends PowerTool {
    /** Logger used by {@link #runTool()} to report the job settings. */
    private static final Logger LOG = Logger.getLogger(ClueExtractLinks.class);

    /** Configuration keys handed back by {@link #getRequiredParameters()}. */
    public static final String[] RequiredParameters = {
        "Cloud9.InputPath",
        "Cloud9.OutputPath",
        "Cloud9.Mappers",
        "Cloud9.Reducers",
        "Cloud9.DocnoMappingFile",
        "Cloud9.IncludeInternalLinks",
        "Cloud9.AnchorTextNormalizer"
    };

    /**
     * Returns the configuration keys declared in {@link #RequiredParameters}.
     *
     * @return the shared {@code RequiredParameters} array itself (not a copy)
     */
    @Override
    public String[] getRequiredParameters() {
        final String[] required = ClueExtractLinks.RequiredParameters;
        return required;
    }

    /**
     * Creates the tool on top of an existing Hadoop configuration.
     *
     * @param conf configuration passed straight through to the {@code PowerTool} constructor
     */
    public ClueExtractLinks(final Configuration conf) {
        super(conf);
    }

    /**
     * Configures the link-extraction job and submits it unless the output path already exists.
     *
     * <p>The docno mapping file must exist; it is registered in the distributed cache so that
     * the mapper can load it in its {@code configure} step.
     *
     * @return always {@code 0}, whether the job ran or was skipped
     * @throws RuntimeException if the docno mapping file does not exist
     * @throws Exception any failure raised while accessing the file system or running the job
     */
    @Override
    public int runTool() throws Exception {
        final JobConf job = new JobConf(this.getConf(), ClueExtractLinks.class);
        final FileSystem fileSystem = FileSystem.get(job);

        // Tool parameters.
        final int mapTasks = job.getInt("Cloud9.Mappers", 1);
        final int reduceTasks = job.getInt("Cloud9.Reducers", 200);
        final String input = job.get("Cloud9.InputPath");
        final String output = job.get("Cloud9.OutputPath");
        final String docnoMappingFile = job.get("Cloud9.DocnoMappingFile");

        // The mapper cannot resolve docnos without the mapping file, so fail early.
        final boolean mappingPresent = fileSystem.exists(new Path(docnoMappingFile));
        if (!mappingPresent) {
            throw new RuntimeException("Error: Docno mapping data file " + docnoMappingFile + " doesn't exist!");
        }
        DistributedCache.addCacheFile(new URI(docnoMappingFile), job);

        job.setJobName("ClueExtractLinks");

        // Memory and timeout settings, given under both the old and the new property names.
        final String heapOption = "-Xmx2048m";
        final String containerMegabytes = "2048";
        job.set("mapred.child.java.opts", heapOption);
        job.setInt("mapred.task.timeout", 60000000);
        job.set("mapreduce.map.memory.mb", containerMegabytes);
        job.set("mapreduce.map.java.opts", heapOption);
        job.set("mapreduce.reduce.memory.mb", containerMegabytes);
        job.set("mapreduce.reduce.java.opts", heapOption);
        job.set("mapreduce.task.timeout", "60000000");

        // Task counts and processing classes; the reducer doubles as the combiner.
        job.setNumMapTasks(mapTasks);
        job.setNumReduceTasks(reduceTasks);
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ArrayListWritable.class);

        // Sequence files in, block-compressed sequence files out.
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
        SequenceFileInputFormat.setInputPaths(job, input);
        final Path outputDir = new Path(output);
        FileOutputFormat.setOutputPath(job, outputDir);

        LOG.info("ClueExtractLinks");
        LOG.info(" - input path: " + input);
        LOG.info(" - output path: " + output);
        LOG.info(" - mapping file: " + docnoMappingFile);
        LOG.info(" - include internal links? " + job.getBoolean("Cloud9.IncludeInternalLinks", false));

        // Existing output is treated as "this step is already done".
        if (fileSystem.exists(outputDir)) {
            LOG.info(output + " already exists! Skipping this step...");
            return 0;
        }
        JobClient.runJob(job);
        return 0;
    }

    /**
     * Merges all anchor-text lists collected for one key into a single list.
     *
     * <p>Entries that are equal according to {@code AnchorText.equalsIgnoreSources} are folded
     * together with {@code addDocumentsFrom}; every other entry is cloned into the output list.
     * {@code runTool()} registers this class both as the combiner and as the reducer.
     *
     * <p>All working state is held per instance or per call. Nothing is {@code static}, so
     * several instances living in the same JVM cannot overwrite each other's buffers.
     */
    public static class Reduce
    extends MapReduceBase
    implements Reducer<Text, ArrayListWritable<AnchorText>, Text, ArrayListWritable<AnchorText>> {
        /** Output buffer, cleared at the start of every {@code reduce} call and then reused. */
        private final ArrayListWritable<AnchorText> arrayList = new ArrayListWritable<AnchorText>();

        /**
         * Emits one merged anchor-text list for {@code key}.
         *
         * @param key key shared by all incoming lists; emitted unchanged
         * @param values anchor-text lists to merge
         * @param output receives exactly one {@code (key, mergedList)} pair
         * @param reporter unused
         * @throws IOException if {@code output.collect} fails
         */
        public void reduce(Text key, Iterator<ArrayListWritable<AnchorText>> values, OutputCollector<Text, ArrayListWritable<AnchorText>> output, Reporter reporter) throws IOException {
            arrayList.clear();
            while (values.hasNext()) {
                for (AnchorText incoming : values.next()) {
                    AnchorText merged = findEquivalent(incoming);
                    if (merged != null) {
                        merged.addDocumentsFrom(incoming);
                    } else {
                        // Store a clone rather than the element handed out by the values iterator.
                        arrayList.add(incoming.clone());
                    }
                }
            }
            output.collect(key, arrayList);
        }

        /** Returns the first buffered entry that equals {@code candidate} ignoring sources, or {@code null}. */
        private AnchorText findEquivalent(AnchorText candidate) {
            for (AnchorText existing : arrayList) {
                if (existing.equalsIgnoreSources(candidate)) {
                    return existing;
                }
            }
            return null;
        }
    }

    public static class Map
    extends MapReduceBase
    implements Mapper<IntWritable, ClueWarcRecord, Text, ArrayListWritable<AnchorText>> {
        private static String base;
        private static String baseHost;
        private static int docno;
        private static final Text keyWord;
        private static final ArrayListWritable<AnchorText> arrayList;
        private static final ClueWarcDocnoMapping docnoMapping;
        private static final Parser parser;
        private static final NodeFilter filter;
        private static NodeList list;
        private static boolean includeInternalLinks;
        private static AnchorTextNormalizer normalizer;

        public void configure(JobConf job) {
            Path[] localFiles;
            try {
                localFiles = DistributedCache.getLocalCacheFiles((Configuration)job);
            }
            catch (IOException e) {
                throw new RuntimeException("Local cache files not read properly.");
            }
            try {
                docnoMapping.loadMapping(localFiles[0], (FileSystem)FileSystem.getLocal((Configuration)job));
            }
            catch (Exception e) {
                e.printStackTrace();
                throw new RuntimeException("Error initializing DocnoMapping!");
            }
            includeInternalLinks = job.getBoolean("Cloud9.IncludeInternalLinks", false);
            try {
                normalizer = (AnchorTextNormalizer)Class.forName(job.get("Cloud9.AnchorTextNormalizer")).newInstance();
            }
            catch (Exception e) {
                e.printStackTrace();
                throw new RuntimeException("Error initializing AnchorTextNormalizer");
            }
        }

        public void map(IntWritable key, ClueWarcRecord doc, OutputCollector<Text, ArrayListWritable<AnchorText>> output, Reporter reporter) throws IOException {
            reporter.incrCounter((Enum)LinkCounter.INPUT_DOCS, 1L);
            try {
                docno = docnoMapping.getDocno(doc.getHeaderMetadataItem("WARC-TREC-ID"));
            }
            catch (NullPointerException e) {
                reporter.incrCounter((Enum)LinkCounter.INVALID_DOCNO, 1L);
                return;
            }
            try {
                base = doc.getHeaderMetadataItem("WARC-Target-URI");
            }
            catch (NullPointerException e) {
                reporter.incrCounter((Enum)LinkCounter.INVALID_URL, 1L);
                return;
            }
            if (base == null) {
                reporter.incrCounter((Enum)LinkCounter.INVALID_URL, 1L);
                return;
            }
            arrayList.clear();
            arrayList.add(new AnchorText(AnchorTextConstants.Type.DOCNO_FIELD.val, null, docno));
            keyWord.set(base);
            output.collect((Object)keyWord, arrayList);
            arrayList.clear();
            reporter.incrCounter((Enum)LinkCounter.OUTPUT_DOCS, 1L);
            try {
                baseHost = new URI(base).getHost();
            }
            catch (Exception e) {
                reporter.incrCounter((Enum)LinkCounter.INVALID_URL, 1L);
                return;
            }
            if (baseHost == null) {
                reporter.incrCounter((Enum)LinkCounter.INVALID_URL, 1L);
                return;
            }
            try {
                parser.setInputHTML(doc.getContent());
                NodeList nl = parser.parse(null);
                BaseHrefTag baseTag = new BaseHrefTag();
                baseTag.setBaseUrl(base);
                nl.add((Node)baseTag);
                parser.setInputHTML(nl.toHtml());
                list = parser.extractAllNodesThatMatch(filter);
            }
            catch (ParserException e) {
                reporter.incrCounter((Enum)LinkCounter.PARSER_FAILED, 1L);
                return;
            }
            catch (StackOverflowError e) {
                reporter.incrCounter((Enum)LinkCounter.PARSER_FAILED, 1L);
                return;
            }
            for (int i = 0; i < list.size(); ++i) {
                LinkTag link = (LinkTag)list.elementAt(i);
                String anchor = link.getLinkText();
                String url = link.extractLink();
                if (url == null || url.equals(base)) continue;
                String host = null;
                try {
                    host = new URI(url).getHost();
                }
                catch (Exception e) {
                    continue;
                }
                if (host == null) continue;
                if (anchor == null) {
                    anchor = "";
                }
                anchor = normalizer.process(anchor);
                arrayList.clear();
                if (baseHost.equals(host)) {
                    if (!includeInternalLinks) continue;
                    arrayList.add(new AnchorText(AnchorTextConstants.Type.INTERNAL_IN_LINK.val, anchor, docno));
                } else {
                    arrayList.add(new AnchorText(AnchorTextConstants.Type.EXTERNAL_IN_LINK.val, anchor, docno));
                }
                try {
                    keyWord.set(url);
                    output.collect((Object)keyWord, arrayList);
                    continue;
                }
                catch (UTFDataFormatException e) {
                    reporter.incrCounter((Enum)LinkCounter.TEXT_TOO_LONG, 1L);
                    keyWord.set(url);
                    byte flag = ((AnchorText)arrayList.get(0)).getType();
                    arrayList.clear();
                    arrayList.add(new AnchorText(flag, "", docno));
                    output.collect((Object)keyWord, arrayList);
                }
            }
        }

        static {
            keyWord = new Text();
            arrayList = new ArrayListWritable();
            docnoMapping = new ClueWarcDocnoMapping();
            parser = new Parser();
            filter = new NodeClassFilter(LinkTag.class);
        }

        public static enum LinkCounter {
            INPUT_DOCS,
            OUTPUT_DOCS,
            INVALID_DOCNO,
            INVALID_URL,
            TEXT_TOO_LONG,
            PARSER_FAILED;

        }
    }
}

