/*******************************************************************************
 * Copyright notice
 * 
 * This source code is copyright of Robert James Haynes - (c) 2010, 2011. All rights reserved.
 * 
 * Any redistribution, reproduction or decompilation of part or all of the code in any form is prohibited 
 * 
 * You may not, except with our express written permission, distribute or commercially exploit the content. Nor may you transmit it or store it in or display it on any website or other form of electronic retrieval system.
 ******************************************************************************/
/**
 *
	Identiza - Fuzzy matching Libraries
    
    Copyright (C) 2019  Robert James Haynes (EntityStream KFT), Budapest Hungary

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see https://www.gnu.org/licenses/agpl-3.0.en.html
 */
package com.entitystream.identiza.wordlist;

import java.io.File;
import java.io.Serializable;
import java.text.NumberFormat;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;

import com.entitystream.identiza.entity.resolve.processing.IdentizaException;

import com.entitystream.identiza.entity.resolve.utils.NYSIIS;

public class WordList implements WordListInterface, Serializable{
	/**
	 * Serialization version identifier for this class.
	 */
	private static final long serialVersionUID = 4653120858149359117L;
	// List-type codes accepted by the constructor and by setListType().
	public static int WORDSWITHNUMBERS=0;
	public static int ACRONYMS=1;
	public static int VOCABULARY=2;
	public static int PHONETIC=3;
	public static int NAMES=4;
	public static int ANONYMOUS=5;
	public static int MATCH=6;
	// NOTE(review): this field is static, so the value written by setListType()
	// is shared by every WordList instance - confirm that this is intended.
	protected static int listType;
	// Backing store: generated key -> word entry.
	private HashMap<String, WordObject> list;

	// Not read or written anywhere in this class; presumably used by subclasses - TODO confirm.
	protected String fileLoc=null;
	// Single-character code picked by the constructor from the list type; defaults to 'V'.
	protected char prefix = 'V';

	// When true, generateKey() NYSIIS-encodes every token of the word.
	private boolean isPhonetic=false;
	public WordList(int _listType){
		setListType(_listType);
		if (_listType==WORDSWITHNUMBERS)
			prefix = 'O';
		if (_listType==ACRONYMS)
			prefix = 'A';
		if (_listType==VOCABULARY)
			prefix = 'V';
		if (_listType==PHONETIC)
			prefix = 'P';
		if (_listType==NAMES)
			prefix = 'N';
		if (_listType==ANONYMOUS)
			prefix = 'Z';
		list = new HashMap<String, WordObject>();
		
	
	}
	/**
	 * @return true when keys are generated phonetically (see generateKey)
	 */
	public boolean isPhonetic() {
		return this.isPhonetic;
	}
	/**
	 * Switches phonetic key generation on or off.
	 * Only affects keys generated after the call; entries already stored keep their keys.
	 *
	 * @param isPhonetic true to NYSIIS-encode words when generating keys
	 */
	public void setPhonetic(boolean isPhonetic) {
		this.isPhonetic = isPhonetic;
	}
	/**
	 * Adds a word (or updates the existing entry) and attaches a parent to it.
	 *
	 * The word is singularized, upper-cased and trimmed before its key is generated.
	 * The custom flag is applied to the entry whether it is new or already present;
	 * the type is only used when a new entry is created.
	 *
	 * @param word         the word to add
	 * @param parentString the parent to append to the entry
	 * @param same         passed through to WordObject.addParent together with the parent
	 * @param type         type given to a newly created entry
	 * @param custom       custom flag set on the entry
	 * @return the key under which the entry is stored
	 */
	public String addWord(String word, String parentString, boolean same, String type, boolean custom){
		final String normalised = Singularize(word).toUpperCase().trim();
		final String key = generateKey(normalised);

		// Re-use the existing entry when there is one so parents accumulate.
		WordObject entry = getWord(normalised);
		if (entry == null) {
			entry = new WordObject(normalised, key, type);
		}
		entry.setCustom(custom);
		entry.addParent(parentString, same);

		list.put(key, entry);
		return key;
	}

	/**
	 * @return the live backing map (key -> entry); changes made through it affect this list
	 */
	public Map<String, WordObject> getList(){
		return this.list;
	}

	/**
	 * Adds a word without a parent, or refreshes the existing entry.
	 *
	 * The word is singularized, upper-cased and trimmed before its key is generated.
	 * The entry is looked up by the normalised word (as the five-argument overload
	 * does) so the key is generated exactly once; looking it up by the already
	 * generated key would run key generation a second time, which can miss the
	 * entry when phonetic keys are enabled. The custom flag is applied to the
	 * entry, matching the five-argument overload.
	 *
	 * @param word   the word to add
	 * @param type   type given to a newly created entry
	 * @param custom custom flag set on the entry
	 * @return the key under which the entry is stored
	 */
	public String addWord(String word, String type, boolean custom){
		word = Singularize(word).toUpperCase().trim();
		String key = generateKey(word);
		WordObject wordo = getWord(word);
		if (wordo == null) {
			wordo = new WordObject(word, key, type);
		}
		wordo.setCustom(custom);
		list.put(key, wordo);
		return key;
	}

	/**
	 * Scores how closely two words are related through their "same" parents.
	 *
	 * @param word1 first word
	 * @param word2 second word
	 * @return 0.1 when one word's key is listed as a parent of the other,
	 *         0.2 when the two words share at least one parent,
	 *         1.0 otherwise (including when either word is not in the list)
	 */
	public double isStemmed(String word1, String word2){
		String key1 = generateKey(word1);
		String key2 = generateKey(word2);
		if (key1==null || key2==null) {
			return 1.0;
		}
		// Look the entries up by word so the key is generated only once; passing
		// the generated key to getWord() would run key generation a second time.
		WordObject wordo1 = getWord(word1);
		WordObject wordo2 = getWord(word2);
		if (wordo1==null || wordo2==null) {
			return 1.0;
		}
		ArrayList<String> parent1 = wordo1.getSameParents();
		ArrayList<String> parent2 = wordo2.getSameParents();

		// Direct relationship: one word is a parent of the other.
		boolean direct = (parent1 != null && parent1.contains(key2))
				|| (parent2 != null && parent2.contains(key1));
		if (direct) {
			return 0.1;
		}

		// Indirect relationship: the words have a parent in common.
		if (parent1 != null && parent2 != null) {
			for (String ky : parent1) {
				if (parent2.contains(ky)) {
					return 0.2;
				}
			}
		}
		return 1.0;
	}

	/**
	 * Returns the first "same" parent recorded for a word.
	 *
	 * @param word1 the word to look up
	 * @return the first same-parent of the word's entry, or the word itself when
	 *         no key can be generated, the word is not in the list, or the entry
	 *         has no same-parents
	 */
	public String getStem(String word1){
		String key1 = generateKey(word1);
		if (key1==null) {
			return word1;
		}
		// Look up by word: getWord() generates the key itself, so passing the
		// generated key would run key generation twice.
		WordObject wordo1 = getWord(word1);
		if (wordo1==null) {
			return word1;
		}
		ArrayList<String> parent1 = wordo1.getSameParents();
		if (parent1==null || parent1.isEmpty()) {
			return word1;
		}
		return parent1.get(0);
	}

	
	/**
	 * Looks up the entry for a word.
	 *
	 * @param word the word; its key is generated with generateKey before the lookup
	 * @return the stored entry, or null when the word is null or not in the list
	 */
	public WordObject getWord(String word){
		if (word == null) {
			return null;
		}
		return list.get(generateKey(word));
	}

	/**
	 * Tests whether a word is present in the list.
	 *
	 * @param word the word; it is singularized, upper-cased and trimmed first
	 * @return true when an entry exists under the word's generated key
	 */
	public boolean inlist(String word){
		final String normalised = Singularize(word).toUpperCase().trim();
		return list.containsKey(generateKey(normalised));
	}

	/**
	 * Decides whether a word is worth keeping: it must pass a few shape checks
	 * and must NOT already be in this list.
	 *
	 * @param word the word; it is singularized, upper-cased and trimmed first
	 * @return false when the word is shorter than two characters, contains '@',
	 *         '/' or '\', contains its first character three times in a row, or
	 *         is present in the list; true otherwise
	 */
	public boolean isUseful(String word){
		final String candidate = Singularize(word).toUpperCase().trim();

		// The length test comes first so charAt(0) below is always safe.
		if (candidate.length() < 2) {
			return false;
		}
		final char first = candidate.charAt(0);
		final String tripledFirst = new String(new char[] { first, first, first });

		boolean looksLikeNoise = candidate.contains("@")   // suspected email
				|| candidate.contains("/")                 // suspected path
				|| candidate.contains("\\")                // suspected path
				|| candidate.contains(tripledFirst);       // first character repeated three times
		if (looksLikeNoise) {
			return false;
		}
		return !list.containsKey(generateKey(candidate).toString());
	}

	/**
	 * Tests whether a word is long enough and absent from this list.
	 *
	 * @param word the word; it is singularized, upper-cased and trimmed first
	 * @param min  minimum length of the normalised word
	 * @return true when the normalised word has at least min characters and is
	 *         not present in the list
	 */
	public boolean isUsefulForComparison(String word, int min){
		final String candidate = Singularize(word).toUpperCase().trim();
		final boolean longEnough = candidate.length() >= min;
		return longEnough && !list.containsKey(generateKey(candidate).toString());
	}


	/**
	 * Static variant of the comparison check that only looks at length.
	 *
	 * @param word the word; it is singularized, upper-cased and trimmed first
	 * @param min  minimum length of the normalised word
	 * @return true when the normalised word has at least min characters
	 */
	public static boolean sisUsefulForComparison(String word, int min){
		final String candidate = Singularize(word).toUpperCase().trim();
		return candidate.length() >= min;
	}

	
	/**
	 * Value of a single upper-case Roman-numeral letter.
	 *
	 * @param letter the letter to decode
	 * @return the letter's value, or 0 when it is not a Roman-numeral letter
	 */
	private static int decodeSingle(char letter) {
		switch (letter) {
			case 'M':
				return 1000;
			case 'D':
				return 500;
			case 'C':
				return 100;
			case 'L':
				return 50;
			case 'X':
				return 10;
			case 'V':
				return 5;
			case 'I':
				return 1;
			default:
				return 0;
		}
	}

	/**
	 * Converts a numeric-looking token to its decimal string form.
	 *
	 * <ul>
	 * <li>A Roman numeral (matched case-insensitively, so "xiv" and "XIV" both
	 *     work) is converted to its decimal value.</li>
	 * <li>A token that is not made purely of word characters with at least one
	 *     letter is parsed with the default-locale {@link NumberFormat}; if that
	 *     fails the token is returned unchanged.</li>
	 * <li>Anything else, including null and the empty string, is returned
	 *     unchanged. The empty string is handled explicitly because the Roman
	 *     pattern also matches it.</li>
	 * </ul>
	 *
	 * @param word the token to convert
	 * @return the decimal representation, or the token itself when it is not a number
	 */
	public static String getNumber(String word) {
		if (word == null || word.length() == 0) {
			return word;
		}
		if (word.matches("(?i)^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$")) {
			int result = 0;
			int length = word.length();
			for (int i = 0; i < length; i++) {
				int current = decodeSingle(Character.toUpperCase(word.charAt(i)));
				// A smaller value in front of a larger one is subtracted (IV, XC, ...).
				boolean subtractive = i + 1 < length
						&& current < decodeSingle(Character.toUpperCase(word.charAt(i + 1)));
				result += subtractive ? -current : current;
			}
			return String.valueOf(result);
		}
		if (!word.matches("\\w*[a-zA-Z]\\w*")) {
			try {
				return NumberFormat.getInstance().parse(word).toString();
			} catch (ParseException e) {
				// Not a number after all: hand the token back untouched.
				return word;
			}
		}
		return word;
	}
	
	/**
	 * Tests whether a token should be treated as containing a number.
	 *
	 * A token counts when it is a Roman numeral (checked on the upper-cased
	 * token) or when it contains at least one character that is not an ASCII
	 * letter. Note that this makes tokens such as "A-B" count as well.
	 *
	 * @param str the token to test
	 * @return false for null, empty and purely alphabetic non-Roman tokens; true otherwise
	 */
	public static boolean containsAnyNumbers(String str) {
		if (str == null || str.isEmpty()) {
			return false;
		}
		boolean romanNumeral = str.toUpperCase()
				.matches("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$");
		boolean lettersOnly = str.matches("[a-zA-Z]*");
		return romanNumeral || !lettersOnly;
	}

	/**
	 * Records the list type.
	 * The backing field is static, so the value is shared by all instances.
	 *
	 * @param listType one of the list-type codes declared on this class
	 */
	public void setListType(int listType) {
		WordList.listType = listType;
	}

	/**
	 * @return the list type most recently stored by setListType (the field is static)
	 */
	public int getListType() {
		return WordList.listType;
	}

	/**
	 * Locale-independent, ASCII case-insensitive suffix test.
	 *
	 * @param value       the string to inspect
	 * @param lowerSuffix the suffix, given in lower case
	 * @return true when value ends with the suffix, ignoring ASCII case
	 */
	private static boolean endsWithAsciiIgnoreCase(String value, String lowerSuffix) {
		int offset = value.length() - lowerSuffix.length();
		if (offset < 0) {
			return false;
		}
		for (int i = 0; i < lowerSuffix.length(); i++) {
			char c = value.charAt(offset + i);
			if (c >= 'A' && c <= 'Z') {
				c = (char) (c + ('a' - 'A'));
			}
			if (c != lowerSuffix.charAt(i)) {
				return false;
			}
		}
		return true;
	}

	/**
	 * A name looks plural when it is longer than three characters and ends in
	 * a single "s" (so "glass" is not plural, and neither is "bus").
	 */
	private static boolean seemsPluralised(String name) {
		return name.length() > 3
				&& endsWithAsciiIgnoreCase(name, "s")
				&& !endsWithAsciiIgnoreCase(name, "ss");
	}

	/**
	 * Reduces a plural-looking word to a singular form with two simple rules:
	 * a trailing "ies" becomes "y" (cities -> city), otherwise a trailing "s"
	 * is dropped (customers -> customer).
	 *
	 * The suffix tests do not depend on the default locale, and the inserted
	 * "y" follows the case of the letter it replaces, so "CITIES" gives "CITY".
	 *
	 * @param name the word to singularize; may be null
	 * @return the singular form, or the input unchanged when it does not look
	 *         plural (null is returned for null)
	 */
	public static String Singularize(String name){
		if (name == null || !seemsPluralised(name)) {
			return name;
		}
		int length = name.length();
		if (endsWithAsciiIgnoreCase(name, "ies")) {
			// cities --> city, keeping the case of the replaced letter
			String y = Character.isUpperCase(name.charAt(length - 3)) ? "Y" : "y";
			return name.substring(0, length - 3) + y;
		}
		// customers --> customer
		return name.substring(0, length - 1);
	}

	/**
	 * Generates the map key for a word.
	 *
	 * The word is upper-cased, trimmed and stripped of the characters
	 * \x01-\x1F. In phonetic mode it is then tokenised with split() and the
	 * NYSIIS encodings of the tokens are concatenated without a separator.
	 *
	 * @param word the word to generate a key for; must not be null
	 * @return the generated key
	 */
	@Override
	public String generateKey(String word){
		final String plainKey = word.toUpperCase().trim().replaceAll("[\\x01-\\x1F]", "");
		if (!isPhonetic) {
			return plainKey;
		}
		final StringBuilder phoneticKey = new StringBuilder();
		final String[] tokens = WordList.split(plainKey);
		for (int i = 0; i < tokens.length; i++) {
			phoneticKey.append(NYSIIS.encode(tokens[i]));
		}
		return phoneticKey.toString();
	}

	/**
	 * Collapses spaced-out initials such as "A B C" into "ABC".
	 *
	 * One trailing space is added to the input before counting; when the
	 * number of spaces is at least the number of remaining characters the
	 * space-free form is returned, otherwise the input is returned unchanged.
	 *
	 * @param in the text to inspect
	 * @return the text without spaces, or the original text
	 */
	public static String deInitialise(String in){
		final String padded = in + " ";
		final String squeezed = padded.replaceAll(" ", "");
		final int spaceCount = padded.length() - squeezed.length();
		return spaceCount >= squeezed.length() ? squeezed : in;
	}

	/**
	 * Splits a value into lower-case tokens.
	 *
	 * Apostrophes are removed first, then the value is cut at spaces
	 * (optionally preceded by a full stop), line breaks, tabs and a fixed set
	 * of punctuation and bracket characters. Remaining full stops are removed
	 * from each token and empty tokens are dropped.
	 *
	 * @param value the text to split; must not be null
	 * @return the non-empty tokens in order of appearance
	 */
	public static String[] split(String value) {
		final String lowered = value.toLowerCase().replaceAll("'", "");
		final String[] rawTokens = lowered
				.split("(\\.\\x20)|\\x20|\\-|,|!|\\?|:|;|&|<|>|=|\\r|\\n|\\t|=|\\+|\\*|/|\\)|\\(|\\]|\\[|\\{|\\}|&");

		final ArrayList<String> kept = new ArrayList<String>(rawTokens.length);
		for (int i = 0; i < rawTokens.length; i++) {
			String token = rawTokens[i].replaceAll("\\.","").trim();
			if (!token.isEmpty()) {
				kept.add(token);
			}
		}
		return kept.toArray(new String[kept.size()]);
	}

	/**
	 * Manual smoke test: prints the tokens of a sample company name, one per line.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args){
		final String[] tokens = WordList.split("ABN AMRO Bank N.V {New York]");
		for (int i = 0; i < tokens.length; i++) {
			System.out.println(tokens[i]);
		}
	}
	
	/**
	 * Strips punctuation, brackets, whitespace, '+', '-' and '*' from a value
	 * and upper-cases what is left. The pipe character is removed as well
	 * because it is part of the character class in the pattern.
	 *
	 * @param val the text to clean; must not be null
	 * @return the cleaned, trimmed, upper-cased text
	 */
	public static String clean(String val) {
		final String stripped = val.replaceAll(
				"[|\\.|,|!|?|:|;| |(|)|&|@|<|>|'|\"|\\[|\\]|(|)|/|\n|\r|\t|=|]|\\+|\\-|\\s|\\*", "");
		return stripped.trim().toUpperCase();
	}

	/**
	 * Same clean-up as clean(), except that dashes are kept.
	 *
	 * @param val the text to clean; must not be null
	 * @return the cleaned, trimmed, upper-cased text with '-' preserved
	 */
	public static String cleanLeaveDashes(String val) {
		final String stripped = val.replaceAll(
				"[|\\.|,|!|?|:|;| |(|)|&|@|<|>|'|\"|\\[|\\]|(|)|/|\n|\r|\t|=|]|\\+|\\s|\\*", "");
		return stripped.trim().toUpperCase();
	}

	/**
	 * Removes colons, semicolons, pipes, quotes, round and square brackets,
	 * line breaks, Unicode "other" characters (\p{C}) and the trade-mark sign,
	 * keeping spaces and the original letter case.
	 *
	 * @param val the text to clean; may be null
	 * @return the cleaned and trimmed text, or "" when val is null
	 */
	public static String cleanLeaveSpaces(String val) {
		if (val == null) {
			return "";
		}
		final String stripped = val.replaceAll(
				"[|:|;|(|)|'|\"|\\[|\\]|(|)|]|\n|\r|\\p{C}|\\x00\\x08\\x0B\\x0C\\x0E-\\x1F|\\u2122", "");
		return stripped.trim();
	}
	
	/**
	 * Placeholder loader: this base implementation loads nothing.
	 *
	 * @return always false
	 * @throws IdentizaException never thrown by this implementation
	 */
	public boolean load() throws IdentizaException {
		return false;
	}
	/**
	 * Finds a listed word that the given value starts with.
	 *
	 * Entries are visited in the map's iteration order, so when several words
	 * match, which one is returned is not defined.
	 *
	 * @param fullValue the text to inspect; must not be null
	 * @return a listed word that prefixes fullValue, or "" when there is none
	 */
	public String startsWith(String fullValue) {
		for (WordObject entry : list.values()) {
			final String candidate = entry.getWord();
			if (fullValue.startsWith(candidate)) {
				return candidate;
			}
		}
		return "";
	}
	/**
	 * Removes every occurrence of every listed word from the given value.
	 *
	 * The words are removed as literal text. They must not be interpreted as
	 * regular expressions: a stored word containing characters such as '.',
	 * '(' or '+' would otherwise match the wrong text or make the pattern
	 * compiler throw.
	 *
	 * @param fullValue the text to strip; must not be null
	 * @return the text with all listed words removed
	 */
	public String removeAnon(String fullValue) {
		String result = fullValue;
		for (WordObject entry : list.values()) {
			// String.replace is literal and is a no-op when the word is absent.
			result = result.replace(entry.getWord(), "");
		}
		return result;
	}



	/**
	 * Joins the strings of a list with ';', e.g. [1.56, 3.3, 4.56] becomes "1.56;3.3;4.56".
	 *
	 * A StringBuilder is used so that long lists are joined in linear time.
	 *
	 * @param arrayList the strings to join; must not be null
	 * @return the joined text, or "" for an empty list
	 */
	public static String serialiseArrayString(ArrayList<String> arrayList) {
		StringBuilder joined = new StringBuilder();
		for (String item : arrayList) {
			joined.append(';').append(item);
		}
		// Every item was prefixed with ';', so drop the leading one.
		if (joined.length() > 1) {
			return joined.substring(1);
		}
		return "";
	}


	/**
	 * Splits a ';'-separated text such as "1.56;3.3;4.56" back into a list of strings.
	 *
	 * Null and empty input give an empty list, so that the text produced by
	 * serialising an empty list deserialises to an empty list again instead
	 * of a list holding one empty string. As with String.split, trailing
	 * empty items are dropped.
	 *
	 * @param in the serialised text; may be null or empty
	 * @return a new, modifiable list with the items in order
	 */
	public static ArrayList<String> deserialiseArrayString(String in) {
		ArrayList<String> res = new ArrayList<String>();
		if (in == null || in.length() == 0) {
			return res;
		}
		for (String s : in.split(";")) {
			res.add(s);
		}
		return res;
	}


	/**
	 * Tests whether any of the given words objects to being matched with any
	 * of the comparison words.
	 *
	 * For every word that is in this list, its "different" parents are
	 * compared (trimmed, ignoring case) with every comparison word.
	 *
	 * @param words           the words whose entries are consulted
	 * @param comparitorWords the words they are being compared against
	 * @return true as soon as one different-parent equals a comparison word, false otherwise
	 */
	public boolean isBadWords(String[] words,
			String[] comparitorWords) {
		for (String word : words) {
			final WordObject entry = this.getWord(word);
			if (entry == null) {
				continue;
			}
			final Set<String> rejected = entry.getDiffParents();
			if (rejected.isEmpty()) {
				continue;
			}
			for (String comparitor : comparitorWords) {
				for (String bad : rejected) {
					if (bad.trim().equalsIgnoreCase(comparitor.trim())) {
						return true;
					}
				}
			}
		}
		return false;
	}
	/**
	 * Placeholder loader: this base implementation loads nothing and ignores its argument.
	 *
	 * @param childisparent ignored by this implementation
	 * @return always false
	 * @throws IdentizaException never thrown by this implementation
	 */
	@Override
	public boolean load(boolean childisparent) throws IdentizaException {
		return false;
	}
	/**
	 * No-op in this base implementation; the argument is ignored.
	 *
	 * @param childisparent ignored by this implementation
	 * @throws IdentizaException never thrown by this implementation
	 */
	@Override
	public void setChildisParent(boolean childisparent) throws IdentizaException {
		// intentionally empty
	}
	/**
	 * Removes every entry from the list.
	 */
	public void clear() {
		this.list.clear();
	}
	/**
	 * Formats the time between two dates as a short human-readable text.
	 *
	 * Only the two most significant units are shown:
	 * <ul>
	 * <li>a day or more: "2 days 1 hour"</li>
	 * <li>an hour or more: "1 hour 30 min"</li>
	 * <li>a minute or more: "m:ss" with zero-padded seconds, e.g. "1:05"</li>
	 * <li>a second or more: "5s"</li>
	 * <li>less than a second, or an end date before the start date: "0 min"</li>
	 * </ul>
	 *
	 * @param startDate start of the interval; must not be null
	 * @param endDate   end of the interval; must not be null
	 * @return the formatted duration
	 */
	public static String printDifference(Date startDate, Date endDate){
		// milliseconds between the two dates
		long remaining = endDate.getTime() - startDate.getTime();

		final long secondsInMilli = 1000L;
		final long minutesInMilli = secondsInMilli * 60;
		final long hoursInMilli = minutesInMilli * 60;
		final long daysInMilli = hoursInMilli * 24;

		final long elapsedDays = remaining / daysInMilli;
		remaining %= daysInMilli;

		final long elapsedHours = remaining / hoursInMilli;
		remaining %= hoursInMilli;

		final long elapsedMinutes = remaining / minutesInMilli;
		remaining %= minutesInMilli;

		final long elapsedSeconds = remaining / secondsInMilli;

		if (elapsedDays > 0) {
			return elapsedDays + (elapsedDays == 1 ? " day " : " days ")
					+ elapsedHours + (elapsedHours == 1 ? " hour" : " hours");
		}
		if (elapsedHours > 0) {
			return elapsedHours + (elapsedHours == 1 ? " hour " : " hours ")
					+ elapsedMinutes + " min";
		}
		if (elapsedMinutes > 0) {
			// Pad the seconds so that 1:05 cannot be mistaken for 1:50.
			return elapsedMinutes + ":" + (elapsedSeconds < 10 ? "0" : "") + elapsedSeconds;
		}
		if (elapsedSeconds > 0) {
			return elapsedSeconds + "s";
		}
		return "0 min";
	}

	
	

}
