package it.unimi.dsi.mg4j.tool;

/*		 
 * MG4J: Managing Gigabytes for Java
 *
 * Copyright (C) 2011 Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.objects.ObjectList;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceOpenHashMap;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.lang.ObjectParser;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.mg4j.index.BitStreamIndex;
import it.unimi.dsi.mg4j.index.BitStreamIndexWriter;
import it.unimi.dsi.mg4j.index.CompressionFlags;
import it.unimi.dsi.mg4j.index.DiskBasedIndex;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.mg4j.index.IndexReader;
import it.unimi.dsi.mg4j.index.IndexWriter;
import it.unimi.dsi.mg4j.index.VariableQuantumIndexWriter;
import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
import it.unimi.dsi.mg4j.index.CompressionFlags.Component;
import it.unimi.dsi.mg4j.search.AlignDocumentIterator;
import it.unimi.dsi.util.Properties;

import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.lang.reflect.InvocationTargetException;
import java.net.URISyntaxException;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.Map;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.log4j.Logger;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.UnflaggedOption;

/** Precomputes alignments of two parallel texts.
 *
 * <p>For each term of the <em>alignee</em> index (or for each term of a user-provided subset)
 * this tool writes an inverted list into a new index. The &ldquo;documents&rdquo; of the
 * new index are the term numbers of the <em>aligner</em> index: pointer <var>a</var> appears
 * in the list of an alignee term if the {@link AlignDocumentIterator} built from the posting list of
 * the alignee term and the posting list of the <var>a</var>-th aligner term returns at least
 * one document. An additional fake pointer (equal to the number of aligner terms)
 * is appended to every list so that no list is ever empty.
 *
 * <p>Besides the index itself, the tool writes the frequencies file, the properties file and,
 * when a subset of terms is specified, a file listing the terms in the order in which
 * their lists have been written.
 *
 * @author Sebastiano Vigna
 * @since 4.0
 */

public class PrecomputeAlignments {
	private static final Logger LOGGER = Util.getLogger( PrecomputeAlignments.class );
	
	/** The overall number of documents of the output index (the number of aligner terms, plus one for the fake document). */
	protected final int numberOfDocuments;
	/** The output basename. */
	protected final String outputBasename;
	/** The logging interval. */
	private final long logInterval;
	/** The index writer for the precomputed index. */ 
	protected IndexWriter indexWriter;
	/** A copy of {@link #indexWriter} which is non-<code>null</code> if {@link #indexWriter} is an instance of {@link VariableQuantumIndexWriter} (never assigned by this class). */ 
	protected VariableQuantumIndexWriter variableQuantumIndexWriter;
	/** The index with which to align; its terms become the documents of the output index. */
	private final BitStreamIndex alignerIndex;
	/** The index to be aligned; its terms (or a subset thereof) become the terms of the output index. */
	private final BitStreamIndex aligneeIndex;
	/** The subset of terms to be processed, or <code>null</code> for all terms of the alignee. */
	private final Collection<? extends CharSequence> terms;
	/** The number of inverted lists that will be written. */
	private final int numberOfTerms;
	/** Whether we are processing just a subset of the terms of the alignee. */
	private final boolean subset;
	
	/** Precomputes the alignments of two indices.
	 * 
	 * @param outputBasename the basename of the combined index.
	 * @param aligneeBasename the basename of the index to be aligned.
	 * @param alignerBasename the basename of the index with which to align.
	 * @param terms a sorted list of terms whose alignments will be precomputed, or <code>null</code> for all terms. 
	 * @param logInterval how often we log.
	 * @throws IllegalArgumentException if <code>terms</code> is not <code>null</code> but the alignee index provides no term map.
	 */
	public PrecomputeAlignments( final String outputBasename,
			final String aligneeBasename,
			final String alignerBasename,
			final ObjectList<? extends CharSequence> terms, final long logInterval ) throws IOException, ConfigurationException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {

		this.logInterval = logInterval;
		this.subset = terms != null;
		
		// The term map of the alignee is requested only when we must resolve user-provided terms.
		aligneeIndex = (BitStreamIndex)Index.getInstance( aligneeBasename, true, false, subset );
		alignerIndex = (BitStreamIndex)Index.getInstance( alignerBasename, true, false, false );
		// Fail fast (before any output file is created) rather than with a NullPointerException in run().
		if ( subset && aligneeIndex.termMap == null ) throw new IllegalArgumentException( "The index " + aligneeBasename + " provides no term map, so the specified terms cannot be resolved" );
		if ( aligneeIndex.numberOfDocuments != alignerIndex.numberOfDocuments ) LOGGER.warn( "Alignee and aligner do not have the same number of documents" );
		// We need a fake document so that lists are never empty.
		numberOfDocuments = alignerIndex.numberOfTerms + 1;
		
		this.terms = terms;
		numberOfTerms = subset ? terms.size() : aligneeIndex.numberOfTerms;
		
		this.outputBasename = outputBasename;

		LOGGER.debug( "Precomputing alignment of " + aligneeBasename + " with " + alignerBasename + " into " + outputBasename );

		// The output index contains just document pointers.
		final Map<Component, Coding> writerFlags = new Reference2ReferenceOpenHashMap<CompressionFlags.Component, CompressionFlags.Coding>( CompressionFlags.DEFAULT_STANDARD_INDEX );
		writerFlags.remove( CompressionFlags.Component.COUNTS );
		writerFlags.remove( CompressionFlags.Component.POSITIONS );
		indexWriter = new BitStreamIndexWriter( outputBasename, numberOfDocuments, true, writerFlags );
	}
	
	
	/** Computes the alignments and writes the resulting index, its frequencies, its properties and,
	 * if a subset of terms was specified, the list of processed terms.
	 * 
	 * <p>The readers, the frequencies stream and the term file are released even if the
	 * computation fails. The index writer is closed on successful completion only.
	 * 
	 * @throws IOException if an I/O error occurs, including a failed write of the term file.
	 */
	public void run() throws IOException, ConfigurationException {
		final Logger logger = Util.getLogger( this.getClass() );
		final ProgressLogger pl = new ProgressLogger( logger, logInterval );
		pl.displayFreeMemory = true;

		// To write the frequency of each term
		final OutputBitStream frequencies = new OutputBitStream( outputBasename + DiskBasedIndex.FREQUENCIES_EXTENSION );
		try {
			final PrintWriter termFile = subset ? new PrintWriter( new BufferedWriter( new OutputStreamWriter( new FileOutputStream( outputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) ) ) : null;
			try {
				pl.expectedUpdates = numberOfTerms;
				pl.itemsName = "terms";
				pl.logInterval = logInterval;
				pl.start( "Precomputing alignments..." );

				final IndexReader aligneeIndexReader = aligneeIndex.getReader();
				try {
					final IntArrayList postings = new IntArrayList();
					final Iterator<? extends CharSequence> termIterator = subset ? terms.iterator() : null;
					final MutableString term = new MutableString();

					for( int t = 0; t < numberOfTerms; t++ ) {
						// The number of the current term in the alignee index, or -1 if the term is unknown.
						final int u;
						if ( subset ) {
							term.replace( termIterator.next() );
							u = (int)aligneeIndex.termMap.getLong( term );
							if ( u == -1 ) LOGGER.warn( "Term " + term + " is not part of the index to be aligned" );
							// Unknown terms are recorded, too: their list will contain just the fake posting.
							termFile.println( term );
						}
						else u = t;

						indexWriter.newInvertedList();
						postings.clear();
						if ( u != -1 ) collectAlignedTerms( aligneeIndexReader, u, postings );

						// A fake posting so that no list is empty.
						postings.add( numberOfDocuments - 1 );

						final int frequency = postings.size();
						indexWriter.writeFrequency( frequency );
						frequencies.writeGamma( frequency );
						// Indexed access avoids boxing each pointer.
						for( int i = 0; i < frequency; i++ ) indexWriter.writeDocumentPointer( indexWriter.newDocumentRecord(), postings.getInt( i ) );

						pl.update();
					}
				}
				finally {
					aligneeIndexReader.close();
				}

				pl.done();

				indexWriter.close();
				// A PrintWriter never throws on write errors: we must ask explicitly.
				if ( termFile != null && termFile.checkError() ) throw new IOException( "An error occurred while writing " + outputBasename + DiskBasedIndex.TERMS_EXTENSION );
			}
			finally {
				if ( termFile != null ) termFile.close();
			}

			final Properties properties = indexWriter.properties();
			properties.addProperty( Index.PropertyKeys.TERMPROCESSOR, ObjectParser.toSpec( alignerIndex.termProcessor ) );
			properties.save( outputBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
		}
		finally {
			frequencies.close();
		}
	}

	/** Appends to the given list the numbers of the aligner terms that align with an alignee term.
	 * 
	 * <p>The aligner index is scanned sequentially with a fresh reader; for each aligner term
	 * the posting list of the alignee term is reread, as the alignment test consumes it.
	 * 
	 * @param aligneeIndexReader a reader over the alignee index.
	 * @param aligneeTerm the number of the term in the alignee index.
	 * @param postings the list to which the numbers of the aligning aligner terms will be appended, in increasing order.
	 */
	private void collectAlignedTerms( final IndexReader aligneeIndexReader, final int aligneeTerm, final IntArrayList postings ) throws IOException {
		final IndexReader alignerIndexReader = alignerIndex.getReader();
		try {
			// The last document of the output index is fake, so there are numberOfDocuments - 1 aligner terms.
			for( int a = 0; a < numberOfDocuments - 1; a++ ) {
				if ( AlignDocumentIterator.getInstance( 
					aligneeIndexReader.documents( aligneeTerm ), 
					alignerIndexReader.nextIterator() 
				).hasNext() ) postings.add( a );
			}
		}
		finally {
			alignerIndexReader.close();
		}
	}

	/** Parses the command line and runs the precomputation.
	 * 
	 * <p>If a file of terms is specified, its lines are loaded and sorted before being
	 * passed to the constructor.
	 * 
	 * @param arg the command-line arguments.
	 */
	public static void main( final String[] arg ) throws JSAPException, ConfigurationException, IOException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
		
		final SimpleJSAP jsap = new SimpleJSAP( PrecomputeAlignments.class.getName(), "Precomputes alignments between two indices.",
				new Parameter[] {
				new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds." ),
				new UnflaggedOption( "outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the resulting index." ),
				new UnflaggedOption( "aligneeBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the index who has to be aligned." ),
				new UnflaggedOption( "alignerBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the index used to align." ),
				new UnflaggedOption( "terms", JSAP.STRING_PARSER, JSAP.NOT_REQUIRED, "A file containing a selected subset of UTF-8 coded words on which the alignments must be computed. The terms must be in sorted order, and appear in the same form as they appear in the file of terms of the alignee (i.e., no term processing will be applied)." ),
		});
		
		final JSAPResult jsapResult = jsap.parse( arg );
		if ( jsap.messagePrinted() ) return;

		final ObjectList<MutableString> terms = jsapResult.userSpecified( "terms" ) ? new FileLinesCollection( jsapResult.getString( "terms" ), "UTF-8" ).allLines() : null;
		if ( terms != null ) Collections.sort( terms );
		
		new PrecomputeAlignments( jsapResult.getString( "outputBasename" ), jsapResult.getString( "aligneeBasename" ), jsapResult.getString( "alignerBasename" ), terms, jsapResult.getLong( "logInterval" ) ).run(); 
	}
}
