/*******************************************************************************
 * Copyright notice
 * 
 * This source code is copyright of Robert James Haynes - (c) 2010, 2011. All rights reserved.
 * 
 * Any redistribution, reproduction or decompilation of part or all of the code in any form is prohibited 
 * 
 * You may not, except with our express written permission, distribute or commercially exploit the content. Nor may you transmit it or store it in or display it on any website or other form of electronic retrieval system.
 ******************************************************************************/
/**
 *
	Identiza - Fuzzy matching Libraries
    
    Copyright (C) 2019  Robert James Haynes (EntityStream KFT), Budapest Hungary

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see https://www.gnu.org/licenses/agpl-3.0.en.html
 */
package com.entitystream.identiza.entity.resolve.utils;

import java.io.Serializable;
import java.util.Locale;
import java.util.logging.Logger;



/**
 * Phonetic encoders used for fuzzy name matching: a NYSIIS-style key
 * ({@link #encode(String)}), a variant that first drops inner vowels
 * ({@link #extendedEncode(String)}) and a four-character Soundex-style code
 * ({@link #soundex(String)}).
 * <p>
 * Every method is static and works only on its arguments.
 */
public final class NYSIIS implements Serializable{

	private static final Logger logger = Logger.getLogger("com.identiza");

	/**
	 * Compile-time switch for the trace output of {@link #log(String)}. While it
	 * is {@code false} the guarded trace statements (and the string building
	 * they need) are not executed.
	 */
	private static final boolean DEBUG = false;

	/**
	 * Encodes a word into its phonetic key by applying a fixed sequence of
	 * prefix, suffix and substring rewrite rules to the upper-cased input.
	 * <p>
	 * The order of the rules matters: later rules operate on the output of
	 * earlier ones (for example vowels are collapsed to {@code A} before the
	 * {@code AW}, {@code AH} and {@code ACKS} rules run).
	 *
	 * @param originalWord the word to encode; may be {@code null}
	 * @return the encoded key, or the empty string when the input is
	 *         {@code null} or empty
	 */
	public static String encode( String originalWord ) {

		if (originalWord == null || originalWord.length() == 0) {
			return "";
		}

		// Locale.ROOT keeps the case mapping independent of the JVM default
		// locale (e.g. a Turkish locale would turn 'i' into a dotted capital I,
		// which none of the rules below would recognise as a vowel).
		StringBuilder word = new StringBuilder( originalWord.toUpperCase(Locale.ROOT) );

		// Strip trailing S/Z characters from words longer than 3 characters.
		// NOTE(review): the length is only checked once, so e.g. "BOSS" is cut
		// down to "BO"; kept as-is because existing keys depend on it - confirm
		// whether stopping at 3 characters was intended.
		if (word.length() > 3) {
			while (word.length() > 0 && isSOrZ(word.charAt(word.length() - 1))) {
				word.deleteCharAt( word.length() - 1 );
			}
		}

		replaceFront( word, "MAC", "MC" );
		replaceFront( word, "PF",  "F" );
		replaceEnd(word,   "IX",  "IC" );
		replaceEnd(word,   "EX",  "EC" );

		replaceEnd(word,   "YE",  "Y" );
		replaceEnd(word,   "EE",  "Y" );
		replaceEnd(word,   "IE",  "Y" );
		replaceEnd(word,   "DT",  "D" );
		replaceEnd(word,   "RD",  "D" );

		replaceEnd(word,   "NT",  "N" );
		replaceEnd(word,   "ND",  "N" );
		// .EV => .EF (never at the very start of the word)
		replaceAll(word,   "EV", "EF", 1 );
		// rh added CI to SI for site vs cite
		replaceAll(word,   "CI", "SI");
		replaceAll(word,   "CY", "SI");

		// Remember the leading character before the vowels are collapsed so a
		// leading vowel can be restored at the end.
		final char first = word.length() > 0 ? word.charAt(0) : ' ';

		// replace all vowels with 'A'
		replaceAll(word,   "E",  "A" );
		replaceAll(word,   "I",  "A" );
		replaceAll(word,   "O",  "A" );
		replaceAll(word,   "U",  "A" );

		// remove any 'W' that follows a vowel
		replaceAll(word,   "AW", "A" );

		replaceAll(word,   "GHT", "GT" );
		replaceAll(word,   "DG", "G" );
		replaceAll(word,   "PH", "F" );
		// rh added to solve dixon vs dickson
		replaceAll(word,   "ACKS", "AX");

		replaceAll(word,   "AH", "A", 1 );
		replaceAll(word,   "HA", "A", 1 );

		replaceAll(word,   "KN", "N" );
		replaceAll(word,   "K", "C" );

		replaceAll(word,   "M", "N", 1 );
		replaceAll(word,   "Q", "G", 1 );

		replaceAll(word,   "SH",  "S" );
		replaceAll(word,   "SCH", "S" );

		replaceAll(word,   "YW",  "Y" );

		// Inner Y becomes A; the first and the last character are left alone.
		replaceAll(word,   "Y",  "A", 1, word.length() - 2 );

		replaceAll(word,   "WR",  "R" );

		replaceAll(word,   "Z",  "S", 1 );

		replaceEnd(word,   "AY",  "Y" );

		// drop every trailing 'A'
		while (word.length() > 0 && word.charAt(word.length() - 1) == 'A') {
			word.deleteCharAt( word.length() - 1 );
		}

		word = reduceDuplicates(word);

		// A word that started with a vowel keeps that original vowel as its
		// first character instead of the collapsed 'A'.
		if (first == 'A' || first == 'E' || first == 'I' || first == 'O' || first == 'U') {
			if (word.length() > 0) {
				word.deleteCharAt(0);
			}
			word.insert(0, first);
		}

		return word.toString();
	}

	/**
	 * Tells whether the character is one of the trailing letters stripped by
	 * {@link #encode(String)}.
	 */
	private static boolean isSOrZ(char c) {
		return c == 'S' || c == 'Z';
	}

	/**
	 * Collapses every run of identical adjacent characters to a single
	 * character.
	 *
	 * @param word the text to scan; it is not modified
	 * @return a new builder holding the collapsed text, or {@code word} itself
	 *         when it is empty
	 */
	private static StringBuilder reduceDuplicates(StringBuilder word) {
		if (word.length() == 0) {
			return word;
		}

		StringBuilder newWord = new StringBuilder(word.length());
		char lastChar = word.charAt(0);
		newWord.append(lastChar);
		for (int i = 1; i < word.length(); ++i) {
			char current = word.charAt(i);
			if (current != lastChar) {
				newWord.append(current);
			}
			lastChar = current;
		}

		if (DEBUG) {
			log("reduceDuplicates: " + word);
		}

		return newWord;
	}

	/**
	 * Replace all occurrences of the given pattern in the string to be encoded
	 * with the given replacement.
	 * @param word the original stringbuilder instance of the word; modified in place
	 * @param find the sequence to locate
	 * @param repl the string to replace it with
	 */
	public static void replaceAll( StringBuilder word, String find, 
			String repl ) {
		replaceAll(word,find,repl,0,-1);
	}

	/**
	 * Replace all occurrences of the given pattern in the string to be encoded
	 * with the given replacement, beginning at the given starting position.
	 * @param word the original stringbuilder instance of the word; modified in place
	 * @param find the sequence to locate
	 * @param repl the string to replace it with
	 * @param startPos the position to begin at
	 */
	public static void replaceAll( StringBuilder word, String find, 
			String repl,
			int startPos ) {
		replaceAll(word,find,repl,startPos,-1);
	}

	/**
	 * Replace all occurrences of the given pattern in the string to be encoded
	 * with the given replacement, beginning at the given starting position up to
	 * the given end position.
	 * <p>
	 * After each replacement the search restarts at {@code startPos}, so a
	 * replacement that produces a new match (for example {@code AW -> A} on
	 * {@code AWW}) is rewritten again, while characters before
	 * {@code startPos} are never touched. When {@code repl} itself contains
	 * {@code find} the search instead continues behind the inserted text so
	 * the loop always terminates. An empty {@code find} is a no-op.
	 *
	 * @param word the original stringbuilder instance of the word; modified in place
	 * @param find the sequence to locate
	 * @param repl the string to replace it with
	 * @param startPos the position to begin at
	 * @param endPos the last position at which a match may start, or
	 *               {@code -1} for no limit
	 */
	public static void replaceAll( StringBuilder word, String find, 
			String repl,
			int startPos,
			int endPos ) {
		if (find.length() == 0) {
			// An empty pattern matches everywhere and would never be consumed.
			return;
		}

		final boolean bounded = endPos != -1;
		final boolean replContainsFind = repl.contains(find);

		int pos = word.indexOf(find, startPos);
		while (pos != -1) {
			if (bounded && pos > endPos) {
				if (DEBUG) {
					log("stopping pos > endPos: " + pos + ":" + endPos);
				}
				break;
			}

			word.replace(pos, pos + find.length(), repl);

			// Restart at startPos (not 0) so the protected prefix stays intact;
			// skip the inserted text when it would match again immediately.
			int resumeAt = replContainsFind ? pos + repl.length() : startPos;
			pos = word.indexOf(find, resumeAt);

			if (DEBUG) {
				log("replaceAll[" + find + "," + repl + "]: " + word);
			}
		}

	}

	/**
	 * If the encoded string begins with the given find string, replace it.
	 * @param word the original stringbuilder instance of the word; modified in place
	 * @param find the prefix to test for
	 * @param repl the replacement to substitute
	 */
	private static void replaceFront( StringBuilder word, String find, 
			String repl ) {
		if (word.indexOf(find) == 0) {
			word.replace(0, find.length(), repl);
			if (DEBUG) {
				log("replaceFront[" + find + "]: " + word);
			}
		}
	}

	/**
	 * If the encoded string ends with the given find string, replace it.
	 * @param word the original stringbuilder instance of the word; modified in place
	 * @param find the suffix to test for
	 * @param repl the replacement to substitute
	 */
	private static void replaceEnd( StringBuilder word, String find, 
			String repl ) {
		int suffixStart = word.length() - find.length();
		if (suffixStart >= 0 && word.lastIndexOf(find) == suffixStart) {
			word.replace(suffixStart, word.length(), repl);
			if (DEBUG) {
				log("replaceEnd[" + find + "]: " + word);
			}
		}
	}

	/**
	 * Logging statement controlled by the {@link #DEBUG} member.
	 * @param msg the message to optionally log.
	 */
	private static void log( String msg ) {
		if (DEBUG) {
			logger.info(msg);
		}
	}

	/**
	 * Check if the two strings produce the same key under
	 * {@link #encode(String)}.
	 * @param s1 the first word; may be {@code null}
	 * @param s2 the second word; may be {@code null}
	 * @return {@code true} when both keys are equal (two {@code null}/empty
	 *         inputs both encode to the empty string and therefore match)
	 */
	public static boolean isEncodeEqual( String s1, String s2 ) {
		return encode( s1 ).equals( encode( s2 ) );
	}

	/**
	 * Encodes the consonant skeleton of a word: every vowel or {@code Y}
	 * (in either case) that is neither the first nor the last character is
	 * removed, and the remainder is passed to {@link #encode(String)}.
	 *
	 * @param trim the word to encode; may be {@code null}
	 * @return the encoded key, or the empty string for {@code null}/empty input
	 */
	public static String extendedEncode(String trim) {
		if (trim == null) {
			return "";
		}

		final int lastIndex = trim.length() - 1;
		StringBuilder sb = new StringBuilder(trim.length());
		for (int i = 0; i < trim.length(); i++) {
			char c = trim.charAt(i);
			boolean isEdge = i == 0 || i == lastIndex;
			// encode() is case-insensitive, so vowels are matched in both cases.
			if (isEdge || "aeiouyAEIOUY".indexOf(c) == -1) {
				sb.append(c);
			}
		}
		return encode(sb.toString());
	}

	/**
	 * Computes a four-character Soundex-style code: the upper-cased first
	 * character followed by up to three digits, padded with {@code 0}.
	 * <p>
	 * Every character that is not one of the coded consonants (vowels,
	 * {@code H}, {@code W}, {@code Y}, digits, punctuation) acts as a
	 * separator: it is not emitted, but it allows the same digit to be
	 * emitted again afterwards.
	 *
	 * @param s the word to encode; may be {@code null}
	 * @return the four-character code, or the empty string for
	 *         {@code null}/empty input
	 */
	public static String soundex(String s) {
		if (s == null || s.length() == 0) {
			return "";
		}

		char[] x = s.toUpperCase(Locale.ROOT).toCharArray();
		final char firstLetter = x[0];

		// RULE [ 2 ]: convert every letter (including the first, so that a
		// following letter with the same code is suppressed) to its digit
		for (int i = 0; i < x.length; i++) {
			x[i] = soundexDigit(x[i]);
		}

		// RULE [ 1 ]: keep the first letter
		StringBuilder output = new StringBuilder(x.length + 4);
		output.append(firstLetter);

		// RULE [ 3 ]: drop separators and adjacent repeats of the same digit
		for (int i = 1; i < x.length; i++) {
			if (x[i] != x[i - 1] && x[i] != '0') {
				output.append(x[i]);
			}
		}

		// RULE [ 4 ]: pad with 0's or truncate to four characters
		output.append("0000");
		return output.substring(0, 4);
	}

	/**
	 * Maps an upper-case character to its Soundex digit; anything that is not
	 * a coded consonant maps to {@code '0'}.
	 */
	private static char soundexDigit(char c) {
		switch (c) {
		case 'B':
		case 'F':
		case 'P':
		case 'V':
			return '1';
		case 'C':
		case 'G':
		case 'J':
		case 'K':
		case 'Q':
		case 'S':
		case 'X':
		case 'Z':
			return '2';
		case 'D':
		case 'T':
			return '3';
		case 'L':
			return '4';
		case 'M':
		case 'N':
			return '5';
		case 'R':
			return '6';
		default:
			return '0';
		}
	}
}


