/**
 *
	Identiza - Fuzzy matching Libraries
    
    Copyright (C) 2019  Robert James Haynes (EntityStream KFT), Budapest Hungary

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see https://www.gnu.org/licenses/agpl-3.0.en.html
 */
package com.entitystream.identiza.entity.resolve.types;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import com.entitystream.identiza.entity.resolve.processing.IdentizaException;
import com.entitystream.identiza.language.utils.zh.ZHConverter;
import com.entitystream.identiza.wordlist.AnonymousWordList;
import com.entitystream.identiza.wordlist.ControlledVocabularyWordList;
import com.entitystream.identiza.wordlist.WordList;
import com.entitystream.identiza.wordlist.WordObject;

/**
 * Match type for Chinese personal names.
 *
 * <p>Comparison and standardisation are delegated to {@link Standardized#compare} and
 * {@code StandardChinesePersonName}. Key generation ({@link #getKeys(Object, boolean)})
 * splits the name into a surname part and a given-name part, expands each part into
 * its alternate spellings, and emits every surname/given-name key combination.
 */
public class MatchChinesePersonName extends MatchBase  implements Serializable{

	/**
	 * Creates the match type with the given name and key width limits; all three
	 * arguments are passed straight to the superclass constructor.
	 */
	public MatchChinesePersonName(String name,int minwidth, int maxwidth)  throws Exception{
		super(name, minwidth, maxwidth);
	}

	/**
	 * Creates the match type registered under the rule function name
	 * {@code "MATCHZHPERSONNAME"} and caps the score at
	 * {@code MatchProcInterface.MATCH_VCLOSE}.
	 *
	 * <p>NOTE(review): {@code start}, {@code end} and {@code gradient} are accepted but
	 * never used here (the {@code gradient} parameter is not stored anywhere in this
	 * constructor) - confirm whether that is intentional.
	 */
	public MatchChinesePersonName(String name, int minwidth, int maxwidth, int start, int end,int gradient)  throws Exception{
		super(name, minwidth, maxwidth, "MATCHZHPERSONNAME");
		maxScore = MatchProcInterface.MATCH_VCLOSE;
	}

	/**
	 * Creates the match type with the given name and switches partial keys off.
	 */
	public MatchChinesePersonName(String name) {
		super(name);
		partialKeys=false;
	}

	/**
	 * Scores two standardised names by delegating to {@code stdBase.compare} with this
	 * match type's rule anonymous-word list and rule controlled vocabulary.
	 */
	@Override
	public double calculateComparisonScore(Standardized stdBase, Standardized stdComp, boolean isSearch, boolean asContent){
		return stdBase.compare(stdComp, getRuleAnon(), getRuleCv(), isSearch, asContent);
	}

	/**
	 * Builds the standardised form of a name from its words.
	 * {@code originalText} is not used by this implementation.
	 */
	@Override
	public Standardized standardise(String originalText, String[] words){
		return new StandardChinesePersonName(words, getRuleAnon(), getRuleCv(), gradient,ruleFunction);
	}

	/**
	 * Generates all keys for a name.
	 *
	 * <p>The non-null parts of {@code value} are joined with single spaces, trimmed and
	 * passed through the key anonymous-word list. The text is then split into a surname
	 * and a given-name part:
	 * <ul>
	 * <li>If the first character is in the CJK Unified Ideographs block, the surname is
	 * whatever the key vocabulary's {@code startsWith} returns, falling back to the first
	 * character; both parts are expanded character by character via
	 * {@link #convertToPinyin(String)}.</li>
	 * <li>Otherwise the text up to the first space is treated as the surname and the
	 * remainder as the given name; a value with no space is treated as a surname only.</li>
	 * </ul>
	 * Every surname key is concatenated with every given-name key; when there are no
	 * given-name keys the surname keys are returned on their own.
	 *
	 * @param value a {@code String} or {@code String[]}; {@code null} yields no keys, any
	 *              other type fails with a {@code ClassCastException}
	 * @param batch not used by this implementation
	 * @return the generated keys, possibly empty, never {@code null}
	 */
	@Override
	public Collection<String> getKeys(Object value, boolean batch){
		Collection<String> keys = new HashSet<String>();
		if (value == null)
			return keys;

		String[] parts = (value instanceof String) ? new String[]{(String)value} : (String[])value;
		StringBuilder joined = new StringBuilder();
		for (String part : parts){
			if (part != null)
				joined.append(' ').append(part);
		}

		String fullValue = getKeyAnon().removeAnon(joined.toString().trim());
		if (fullValue == null || fullValue.length()==0)
			return keys;

		ArrayList<HashSet<String>> surnameAlternates;
		ArrayList<HashSet<String>> firstnameAlternates;
		if (Character.UnicodeBlock.of(fullValue.charAt(0)) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
			//starts with chinese text - find the surname based on the cv
			String surname = null;
			if (getKeyCv()!=null)
				surname = getKeyCv().startsWith(fullValue);
			//no vocabulary, or no surname found in it: assume a one character surname
			if (surname == null || surname.length()==0)
				surname = fullValue.substring(0, 1);
			String firstname = fullValue.substring(surname.length());
			surnameAlternates = convertToPinyin(surname);
			firstnameAlternates = convertToPinyin(firstname);
		} else {
			//latin text - the first token is taken as the surname, the rest as the given name
			int spaceAt = fullValue.indexOf(' ');
			if (spaceAt < 0){
				//single token: surname only (previously this failed in substring)
				surnameAlternates = getAlternates(fullValue);
				firstnameAlternates = new ArrayList<HashSet<String>>();
			} else {
				surnameAlternates = getAlternates(fullValue.substring(0, spaceAt));
				firstnameAlternates = getAlternates(fullValue.substring(spaceAt+1));
			}
		}

		HashSet<String> surnameKeys = processKey(surnameAlternates , 0, "", false, false, minWidth);
		HashSet<String> firstnameKeys = processKey(firstnameAlternates , 0, "", true, false, minWidth);

		//for each combination - concatenate the keys
		for (String surnameKey : surnameKeys){
			if (firstnameKeys.size()>0){
				for (String firstnameKey : firstnameKeys){
					keys.add(surnameKey+firstnameKey);
				}
			} else {
				keys.add(surnameKey);
			}
		}
		return keys;
	}

	/**
	 * Splits latin text into words with {@code WordList.split} and returns, per word,
	 * the set produced by {@code getAlternateWords} using the key word lists.
	 */
	private ArrayList<HashSet<String>> getAlternates(String latinText) {
		ArrayList<HashSet<String>> ret = new ArrayList<HashSet<String>>();
		for (String word : WordList.split(latinText)){
			ret.add(getAlternateWords(word, getKeyAnon(), getKeyCv()));
		}
		return ret;
	}

	/**
	 * Normalises a single word into key form: upper-cased with ASCII digits and
	 * backticks removed. Producing the full set of keys for a name is done in
	 * {@link #getKeys(Object, boolean)}, because a name maps to multiple keys.
	 */
	@Override
	public String getKey(String word){
		return word.toUpperCase().replaceAll("[0-9]", "").replaceAll("`", "");
	}

	/**
	 * Looks up the alternates of each character of {@code traditionalStr} in the key
	 * vocabulary and returns them in key form.
	 *
	 * <p>Each character is looked up by its upper-case hexadecimal code point value:
	 * first as given, and only if that entry is missing, in its simplified form. The
	 * "same parents" of the entry found are added to the result. Without a key
	 * vocabulary the result is empty.
	 */
	private HashSet<String> toPinyin(String traditionalStr) {
		// get Simplified text
		ZHConverter converter = ZHConverter.getInstance(ZHConverter.SIMPLIFIED);
		String simplifiedStr = converter.convert(traditionalStr);

		HashSet<String> words = new HashSet<String>();
		if (getKeyCv()==null)
			return words;

		// both strings are indexed in step, so stop at the shorter one
		int length = Math.min(simplifiedStr.length(), traditionalStr.length());
		for (int i = 0; i < length; i++){
			try{
				String originalHex = Integer.toHexString(traditionalStr.charAt(i)).toUpperCase();
				String simplifiedHex = Integer.toHexString(simplifiedStr.charAt(i)).toUpperCase();
				WordObject originalWord = getKeyCv().getWord(originalHex);
				WordObject simplifiedWord = getKeyCv().getWord(simplifiedHex);
				if (originalWord!=null)
					concatentateArray(originalWord.getSameParents(), words);
				else if (simplifiedWord!=null)
					concatentateArray(simplifiedWord.getSameParents(), words);
			} catch(Exception e){
				// best effort: a failed lookup for one character must not stop key generation
				e.printStackTrace();
			}
		}
		return words;
	}

	/**
	 * Converts text character by character: the returned list has one entry per
	 * character of {@code fullValue}, each holding that character's alternates as
	 * returned by {@link #toPinyin(String)}.
	 */
	private ArrayList<HashSet<String>> convertToPinyin(String fullValue){
		ArrayList<HashSet<String>> phoneticText = new ArrayList<HashSet<String>>();
		for (int i=0; i< fullValue.length(); i++){
			phoneticText.add(toPinyin(fullValue.substring(i, i+1)));
		}
		return phoneticText;
	}

	/**
	 * Adds the key form ({@link #getKey(String)}) of every word in {@code in} to
	 * {@code out}. A {@code null} list adds nothing.
	 */
	private void concatentateArray(Collection<String> in, HashSet<String> out){
		if (in == null)
			return;
		for (String word : in){
			// out is a set, so duplicates are dropped without an explicit check
			out.add(getKey(word));
		}
	}
}
