/*******************************************************************************
 * Copyright notice
 * 
 * This source code is copyright of Robert James Haynes - (c) 2010, 2011. All rights reserved.
 * 
 * Any redistribution, reproduction or decompilation of part or all of the code in any form is prohibited 
 * 
 * You may not, except with our express written permission, distribute or commercially exploit the content. Nor may you transmit it or store it in or display it on any website or other form of electronic retrieval system.
 ******************************************************************************/
/**
 *
	Identiza - Fuzzy matching Libraries
    
    Copyright (C) 2019  Robert James Haynes (EntityStream KFT), Budapest Hungary

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see https://www.gnu.org/licenses/agpl-3.0.en.html
 */
package com.entitystream.identiza.entity.resolve.utils;

import java.io.Serializable;
import java.util.ArrayList;
import com.entitystream.identiza.entity.resolve.processing.GradientGenerator;
import com.entitystream.identiza.wordlist.WordList;

/**
 * Static helpers that compute edit distances between strings and between
 * lists of word tokens, plus a few small parsing/search utilities.
 *
 * All Levenshtein variants use two rolling rows ("previous" and "current")
 * instead of a full (n+1) x (m+1) matrix; after each row of the second input
 * is processed the two arrays are swapped rather than copied.
 */
public class EditDistance implements Serializable {

	/**
	 * Computes the Levenshtein distance (unit cost insert / delete / substitute)
	 * between two strings.
	 *
	 * @param s first string, must not be null
	 * @param t second string, must not be null
	 * @return the edit distance as a double; the length of the other string when one is empty
	 * @throws IllegalArgumentException if either argument is null
	 */
	public static double getLevenshteinDistanceOfString (String s, String t) {
		if (s == null || t == null) {
			throw new IllegalArgumentException("Strings must not be null");
		}

		int n = s.length();
		int m = t.length();
		if (n == 0) {
			return m;
		}
		if (m == 0) {
			return n;
		}

		double[] previous = new double[n + 1];
		double[] current = new double[n + 1];
		for (int i = 0; i <= n; i++) {
			previous[i] = i;
		}

		for (int j = 1; j <= m; j++) {
			char tChar = t.charAt(j - 1);
			current[0] = j;
			for (int i = 1; i <= n; i++) {
				double cost = s.charAt(i - 1) == tChar ? 0 : 1;
				// minimum of: cell to the left + 1, cell above + 1, diagonal + substitution cost
				current[i] = Math.min(Math.min(current[i - 1] + 1, previous[i] + 1), previous[i - 1] + cost);
			}
			// swap rows: the row just filled becomes the "previous" row
			double[] swap = previous;
			previous = current;
			current = swap;
		}

		// after the final swap the most recent row lives in 'previous'
		return previous[n];
	}

	/**
	 * Levenshtein distance in which the substitution cost is scaled by a
	 * position dependent gradient (see {@link #convertCostByGradient}).
	 * Insertions and deletions keep a cost of 1.
	 *
	 * The gradients are obtained from
	 * {@code GradientGenerator.generate(min(n,m), max(n,m), 1, 1, 0, gradientType)}.
	 *
	 * @param s first string, must not be null
	 * @param t second string, must not be null
	 * @param gradientType gradient type passed through to {@code GradientGenerator.generate}
	 * @return the weighted distance; the length of the other string when one is empty
	 * @throws IllegalArgumentException if either string is null
	 */
	public static double getWeightedLevenshteinDistanceOfString (String s, String t, int gradientType) {
		if (s == null || t == null) {
			throw new IllegalArgumentException("Strings must not be null");
		}

		int n = s.length();
		int m = t.length();

		// Trivial cases first: no gradients are needed (or generated) for an empty input.
		if (n == 0) {
			return m;
		}
		if (m == 0) {
			return n;
		}

		double[] gradients = GradientGenerator.generate(Math.min(n, m), Math.max(n, m), 1, 1, 0, gradientType);

		double[] previous = new double[n + 1];
		double[] current = new double[n + 1];
		for (int i = 0; i <= n; i++) {
			previous[i] = i;
		}

		for (int j = 1; j <= m; j++) {
			char tChar = t.charAt(j - 1);
			current[0] = j;
			for (int i = 1; i <= n; i++) {
				double cost = convertCostByGradient(s.charAt(i - 1) == tChar ? 0 : 1, i - 1, j - 1, gradients);
				// minimum of: cell to the left + 1, cell above + 1, diagonal + weighted substitution cost
				current[i] = Math.min(Math.min(current[i - 1] + 1, previous[i] + 1), previous[i - 1] + cost);
			}
			double[] swap = previous;
			previous = current;
			current = swap;
		}

		return previous[n];
	}

	/**
	 * Cost of treating word {@code one} as a match for word {@code two}.
	 *
	 * Evaluation order:
	 * <ol>
	 * <li>equal words cost 0.0;</li>
	 * <li>with a non-null {@code anon} list: 0.1 if either word is reported as not useful
	 *     for comparison by that list;</li>
	 * <li>with a null {@code anon}: 1.0 if either word is reported as not useful by the
	 *     static {@code WordList.sisUsefulForComparison};</li>
	 * <li>with a non-null {@code cv}: whatever {@code cv.isStemmed(one, two)} returns;</li>
	 * <li>0.8 if one word is a prefix of the other;</li>
	 * <li>otherwise the character Levenshtein distance divided by the shorter word length.</li>
	 * </ol>
	 */
	private static double getCost(String one, String two, WordList anon, WordList cv){
		if (one.equals(two)) {
			return 0.0;
		}

		if (anon != null) {
			if (!anon.isUsefulForComparison(one, 1) || !anon.isUsefulForComparison(two, 1)) {
				return 0.1;
			}
		} else {
			// Original logic was "(neither useful) or (usefulness differs)", which is
			// exactly "at least one of the two is not useful".
			boolean oneUseful = WordList.sisUsefulForComparison(one, 1);
			boolean twoUseful = WordList.sisUsefulForComparison(two, 1);
			if (!oneUseful || !twoUseful) {
				return 1.0;
			}
		}

		if (cv != null) {
			return cv.isStemmed(one, two);
		}

		// An empty word is a prefix of everything, so the division below never sees length 0.
		if (one.startsWith(two) || two.startsWith(one)) {
			return 0.8;
		}

		return getLevenshteinDistanceOfString(one, two) / Math.min(one.length(), two.length());
	}

	/**
	 * Convenience overload of
	 * {@link #getHaynesEditDistanceOfArrayList(ArrayList, ArrayList, WordList, WordList, double[], boolean)}
	 * with {@code asContent = true}.
	 */
	public static double getHaynesEditDistanceOfArrayList(ArrayList<String> s, ArrayList<String> t, WordList anon, WordList cv, double[] gradients){
		return getHaynesEditDistanceOfArrayList(s, t, anon, cv, gradients, true);
	}

	/**
	 * Order-insensitive word distance: every word of the longer list is paired with
	 * its cheapest counterpart (per {@code getCost}) in the shorter list, that cost is
	 * multiplied by the larger of the two positional gradients, and the products are summed.
	 *
	 * When several words of the shorter list tie on cost, the last one scanned supplies
	 * the gradient; scanning stops early once a zero-cost match is found.
	 *
	 * NOTE(review): if the shorter list is empty every word contributes
	 * {@code Double.MAX_VALUE * 0}, i.e. the result is 0 - confirm that is intended.
	 *
	 * @param s first token list
	 * @param t second token list
	 * @param anon optional word list handed to the cost function, may be null
	 * @param cv optional word list handed to the cost function, may be null
	 * @param gradients positional weights, indexed by positions in both lists; must be at
	 *                  least as long as the longer list when both lists are non-empty
	 * @return the summed weighted cost
	 */
	public static double getSimpleEditDistanceOfArrayList(
			ArrayList<String> s, ArrayList<String> t,
			WordList anon, WordList cv, double[] gradients ) {
		boolean sIsLargest = s.size() >= t.size();
		ArrayList<String> largest = sIsLargest ? s : t;
		ArrayList<String> smallest = sIsLargest ? t : s;

		double cost = 0.0;
		for (int lpos = 0; lpos < largest.size(); lpos++) {
			String lString = largest.get(lpos);
			double costWord = Double.MAX_VALUE;
			double gradientWord = 0;
			for (int spos = 0; spos < smallest.size(); spos++) {
				double costTemp = getCost(lString, smallest.get(spos), anon, cv);
				if (costTemp <= costWord) {
					costWord = costTemp;
					gradientWord = Math.max(gradients[spos], gradients[lpos]);
				}
				if (costWord == 0) {
					break; // cannot do better than an exact match
				}
			}
			cost += costWord * gradientWord;
		}
		return cost;
	}

	/**
	 * Position-aware word distance.
	 *
	 * Each word of the "outer" list is compared with every word of the "inner" list.
	 * A comparison counts as a candidate match when its gradient-weighted cost is
	 * strictly below the gradient of the outer position. A candidate is then penalised
	 * by how far its inner position lies from the closest position matched by the
	 * previous outer word (see {@link #positionPenalty}); if the previous outer word
	 * had no match the penalty is 0. The cheapest candidate cost per outer word is summed.
	 *
	 * Which list is the outer one: with {@code asContent == false} it is the longer list
	 * ({@code s} on ties); with {@code asContent == true} it is the shorter list
	 * ({@code t} on ties).
	 *
	 * NOTE(review): an outer word without any candidate match contributes 1000000.0.
	 * The original inline description said such a word "will be given the score of its
	 * position"; behaviour is left unchanged here - confirm which one is intended.
	 *
	 * @param s first token list
	 * @param t second token list
	 * @param anon optional word list handed to the cost function, may be null
	 * @param cv optional word list handed to the cost function, may be null
	 * @param gradients positional weights indexed by outer position; needs at least as
	 *                  many entries as the outer list when the inner list is non-empty
	 * @param asContent selects which list drives the outer loop, see above
	 * @return the summed cost
	 */
	public static double getHaynesEditDistanceOfArrayList(ArrayList<String> s, ArrayList<String> t, WordList anon, WordList cv, double[] gradients, boolean asContent){
		// s is the outer list when exactly one of "s is at least as long" / "asContent" holds.
		boolean sIsOuter = (s.size() >= t.size()) != asContent;
		ArrayList<String> outer = sIsOuter ? s : t;
		ArrayList<String> inner = sIsOuter ? t : s;
		double longestSize = 1.0 * Math.max(s.size(), t.size());

		double cost = 0.0;
		// inner positions that produced the best cost(s) for the previous outer word
		ArrayList<Integer> lastGoodMatch = new ArrayList<Integer>();
		int outerPos = -1;
		for (String outerWord : outer) {
			outerPos++;
			double costWord = 1000000.0;
			ArrayList<Integer> bestMatchPos = new ArrayList<Integer>();
			int innerPos = -1;
			for (String innerWord : inner) {
				innerPos++;
				double temp = getCost(outerWord, innerWord, anon, cv) * gradients[outerPos];
				if (temp >= gradients[outerPos]) {
					continue; // not a (partial) match
				}

				double posWeight = 1000000.0;
				if (lastGoodMatch.isEmpty()) {
					// No previous match: behave as if it sat directly before this position.
					posWeight = Math.min(posWeight, positionPenalty(innerPos, innerPos - 1, longestSize));
				} else {
					for (int previousMatch : lastGoodMatch) {
						posWeight = Math.min(posWeight, positionPenalty(innerPos, previousMatch, longestSize));
					}
				}

				temp = temp + posWeight;
				if (temp <= costWord) {
					bestMatchPos.add(innerPos);
					costWord = temp;
				}
			}
			lastGoodMatch = bestMatchPos;
			cost += costWord;
		}
		return cost;
	}

	/**
	 * Penalty for a match at {@code pos} given a previous match at {@code previousPos}:
	 * (|pos - previousPos| - 1) / longestSize. Directly adjacent positions cost 0.
	 */
	private static double positionPenalty(int pos, int previousPos, double longestSize) {
		return (1.0 * Math.abs(pos - previousPos) - 1.0) / longestSize;
	}

	/**
	 * Token-level Levenshtein distance; the longer list is always passed as the first
	 * argument of the six-argument overload (with {@code asContent = false}).
	 *
	 * @throws IllegalArgumentException if either list is null
	 */
	public static double getLevenshteinDistanceOfStringArrayList (ArrayList<String> s, ArrayList<String> t, WordList anon, WordList cv, double[] gradients) {
		if (s == null || t == null) {
			throw new IllegalArgumentException("ArrayLists must not be null");
		}
		if (s.size() > t.size()) {
			return getLevenshteinDistanceOfStringArrayList(s, t, anon, cv, gradients, false);
		}
		return getLevenshteinDistanceOfStringArrayList(t, s, anon, cv, gradients, false);
	}

	/**
	 * Token-level Levenshtein distance: the minimum number of token insertions,
	 * deletions and substitutions turning {@code s} into {@code t}, where tokens are
	 * compared case-insensitively.
	 *
	 * {@code anon}, {@code cv}, {@code gradients} and {@code asContent} are not used by
	 * this method; they are kept so the signature stays compatible with existing callers.
	 *
	 * @param s first token list, must not be null
	 * @param t second token list, must not be null
	 * @return the distance; the size of the other list when one is empty
	 * @throws IllegalArgumentException if either list is null
	 */
	public static double getLevenshteinDistanceOfStringArrayList (ArrayList<String> s, ArrayList<String> t, WordList anon, WordList cv, double[] gradients, boolean asContent) {
		if (s == null || t == null) {
			throw new IllegalArgumentException("ArrayLists must not be null");
		}

		int n = s.size();
		int m = t.size();
		if (n == 0) {
			return m;
		}
		if (m == 0) {
			return n;
		}

		double[] previous = new double[n + 1];
		double[] current = new double[n + 1];
		for (int i = 0; i <= n; i++) {
			previous[i] = i;
		}

		// One DP row per token of t (the previous code iterated up to n and skipped rows
		// beyond t's size, which dropped the tail of t whenever t was the longer list).
		for (int j = 1; j <= m; j++) {
			String tToken = t.get(j - 1);
			current[0] = j;
			for (int i = 1; i <= n; i++) {
				double cost = s.get(i - 1).equalsIgnoreCase(tToken) ? 0 : 1;
				// minimum of: cell to the left + 1, cell above + 1, diagonal + substitution cost
				current[i] = Math.min(Math.min(current[i - 1] + 1, previous[i] + 1), previous[i - 1] + cost);
			}
			double[] swap = previous;
			previous = current;
			current = swap;
		}

		// The distance is the last cell of the last completed row (previously the
		// method returned only the final 0/1 substitution cost).
		return previous[n];
	}

	/**
	 * Scales {@code cost} by the larger of the gradients at the two positions.
	 * Positions beyond the end of {@code gradient} use its last entry.
	 */
	private static double convertCostByGradient(double cost, int pos1, int pos2, double[] gradient) {
		int last = gradient.length - 1;
		double first = gradient[Math.min(pos1, last)];
		double second = gradient[Math.min(pos2, last)];
		return cost * Math.max(first, second);
	}

	/**
	 * Runs the two-row Levenshtein recurrence over an n x n grid in which the
	 * substitution cost of column i is 0 when {@code match[i] > 0} and 1 otherwise.
	 *
	 * Despite the method name, no transposition handling is performed.
	 * NOTE(review): because the substitution cost depends only on the column, the pure
	 * diagonal path is always optimal, so the result works out to the number of entries
	 * that are {@code <= 0}; the recurrence is kept as-is to preserve the original structure.
	 *
	 * @param match match indicators, must not be null
	 * @return the computed distance; 0 for an empty array
	 * @throws IllegalArgumentException if {@code match} is null
	 */
	public static int getDemerauLevenshteinDistanceOfMatch (int[] match) {
		if (match == null) {
			throw new IllegalArgumentException("Match array must not be null");
		}

		int n = match.length;
		if (n == 0) {
			return 0;
		}

		int[] previous = new int[n + 1];
		int[] current = new int[n + 1];
		for (int i = 0; i <= n; i++) {
			previous[i] = i;
		}

		for (int j = 1; j <= n; j++) {
			current[0] = j;
			for (int i = 1; i <= n; i++) {
				int cost = match[i - 1] > 0 ? 0 : 1;
				// minimum of: cell to the left + 1, cell above + 1, diagonal + cost
				current[i] = Math.min(Math.min(current[i - 1] + 1, previous[i] + 1), previous[i - 1] + cost);
			}
			int[] swap = previous;
			previous = current;
			current = swap;
		}

		return previous[n];
	}

	/**
	 * Lenient integer parsing. Tries {@code Integer.parseInt} on the whole string; if
	 * that fails, keeps only the ASCII digits 0-9 (any sign is dropped) and parses those.
	 *
	 * @param string text to parse, may be null
	 * @return the parsed value, or -1 if the input is null, contains no digits, or the
	 *         digits do not fit in an int
	 */
	public static int niceParseInt(String string) {
		if (string == null) {
			return -1; // previously surfaced as a NullPointerException from the fallback path
		}

		try {
			return Integer.parseInt(string);
		} catch (NumberFormatException notPlainInt) {
			StringBuilder digits = new StringBuilder(string.length());
			for (int i = 0; i < string.length(); i++) {
				char c = string.charAt(i);
				if (c >= '0' && c <= '9') {
					digits.append(c);
				}
			}
			try {
				return Integer.parseInt(digits.toString());
			} catch (NumberFormatException ignored) {
				// no digits at all, or too many to fit in an int
				return -1;
			}
		}
	}

	/**
	 * Finds the first word that equals any of the tokens, ignoring case.
	 *
	 * @param words words to scan, in order
	 * @param tokens tokens to look for
	 * @return index into {@code words} of the first match, or -1 if there is none
	 */
	public static int findToken(String[] words, String[] tokens){
		for (int wordpos = 0; wordpos < words.length; wordpos++) {
			String word = words[wordpos];
			for (String token : tokens) {
				if (token.equalsIgnoreCase(word)) {
					return wordpos;
				}
			}
		}
		return -1;
	}

	/** Small manual demo: prints the simple edit distance of two two-word lists. */
	public static void main (String[] args){
		ArrayList<String> s = new ArrayList<String>();
		ArrayList<String> t = new ArrayList<String>();
		s.add("SUE");
		s.add("LUE");
		t.add("SU");
		t.add("LU");

		double[] gradients = GradientGenerator.generate(2, 2, 2, 0, 1, GradientGenerator.LINEAR);
		double e = EditDistance.getSimpleEditDistanceOfArrayList(s, t, null, null, gradients);
		System.out.println(e);
	}

}
