/**
 *
	Identiza - Fuzzy matching Libraries
    
    Copyright (C) 2019  Robert James Haynes (EntityStream KFT), Budapest Hungary

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see https://www.gnu.org/licenses/agpl-3.0.en.html
 */
package com.entitystream.identiza.entity.resolve.types;

import java.io.Serializable;
import java.util.ArrayList;

import com.entitystream.identiza.entity.resolve.processing.GradientGenerator;
import com.entitystream.identiza.entity.resolve.utils.EditDistance;
import com.entitystream.identiza.language.utils.zh.ZHConverter;
import com.entitystream.identiza.wordlist.WordList;

/**
 * Standardized representation of a Chinese person name.
 *
 * <p>The constructor splits the supplied name into a surname and a first
 * name and stores each part twice: once as a list of single-character
 * strings and once as a list of the hexadecimal values of those characters.
 * {@link #compare} scores two such names against each other using a
 * gradient-weighted edit distance over these lists.
 */
public class StandardChinesePersonName extends StandardizedSerialized   {
	/** First-name characters, one single-character string per UTF-16 char. */
	public ArrayList<String> firstnamechars = new ArrayList<String>();
	/** Surname characters, one single-character string per UTF-16 char. */
	public ArrayList<String> surnamechars = new ArrayList<String>();
	/**
	 * Hexadecimal value of each first-name character. Despite the "py"
	 * suffix, no pinyin transliteration is performed in this class.
	 */
	public ArrayList<String> firstnamecharspy = new ArrayList<String>();
	/**
	 * Hexadecimal value of each surname character. Despite the "py"
	 * suffix, no pinyin transliteration is performed in this class.
	 */
	public ArrayList<String> surnamecharspy = new ArrayList<String>();
	/** Number of input words the name was built from. */
	public int tokensize;
	private WordList cv;
	private WordList anon;
	private int gradient;

	/**
	 * Creates an empty instance: all character lists are empty and
	 * {@link #tokensize} is 0.
	 */
	public StandardChinesePersonName()
	{

	}

	/**
	 * Builds the standardized name from its raw tokens.
	 *
	 * @param words    raw name tokens; each one is passed through the
	 *                 {@code ZHConverter} obtained for {@code ZHConverter.SIMPLIFIED}
	 * @param anon     word list whose {@code removeAnon} is applied to the joined
	 *                 name (presumably strips noise words; see {@code WordList})
	 * @param cv       word list used to look up a leading surname; may be
	 *                 {@code null}, in which case a one-character surname is assumed
	 * @param gradient gradient selector later handed to {@code GradientGenerator}
	 * @param name     passed through to the superclass constructor
	 */
	public StandardChinesePersonName(String[] words, WordList anon, WordList cv, int gradient, String name){
		super(words, anon, cv, gradient,name);

		this.anon=anon;
		this.cv=cv;
		this.tokensize=words.length;
		this.gradient=gradient;
		ZHConverter converter = ZHConverter.getInstance(ZHConverter.SIMPLIFIED);

		// Join the converted tokens with a trailing space each; the result is trimmed below.
		StringBuilder joined = new StringBuilder();
		for (String token : words){
			joined.append(converter.convert(token)).append(' ');
		}
		String val = anon.removeAnon(joined.toString());
		String[] basenames = extractNames(val.trim());

		// basenames[0] is the first name, basenames[1] the surname.
		addCharacters(basenames[1], surnamechars, surnamecharspy);
		addCharacters(basenames[0], firstnamechars, firstnamecharspy);
	}

	/**
	 * Appends every character of {@code source} to {@code chars} as a
	 * single-character string and to {@code hexCodes} as its hexadecimal value.
	 */
	private static void addCharacters(String source, ArrayList<String> chars, ArrayList<String> hexCodes) {
		for (int i = 0; i < source.length(); i++) {
			char c = source.charAt(i);
			chars.add(String.valueOf(c));
			hexCodes.add(Integer.toHexString(c));
		}
	}

	/**
	 * Splits a full name into first name and surname.
	 *
	 * <p>If the value starts with a CJK unified ideograph, the surname is the
	 * prefix returned by {@code cv.startsWith} (falling back to the first
	 * character when there is no list or no match) and the first name is the
	 * remainder. Otherwise the text up to the first space is taken as the
	 * surname and everything after it as the first name; a value without any
	 * space is treated as a surname only.
	 *
	 * @param fullValue the trimmed full name; may be empty
	 * @return a two-element array {@code {firstname, surname}}; both elements
	 *         are empty strings when {@code fullValue} is null or empty
	 */
	private String[] extractNames(String fullValue) {
		// Nothing to split: avoids charAt(0) failing on an empty value.
		if (fullValue == null || fullValue.length() == 0) {
			return new String[]{"", ""};
		}

		String firstname;
		String surname;
		if (Character.UnicodeBlock.of(fullValue.charAt(0)) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
			//starts with chinese text
			surname = (cv != null) ? cv.startsWith(fullValue) : null;
			if (surname == null || surname.length() == 0) {
				//no known surname matched - assume 1 char
				surname = String.valueOf(fullValue.charAt(0));
			}
			firstname = fullValue.substring(surname.length());
		} else {
			//non-chinese text - the first space-separated token is taken as the surname
			int space = fullValue.indexOf(' ');
			if (space < 0) {
				// Single token: there is nothing to split on.
				surname = fullValue;
				firstname = "";
			} else {
				surname = fullValue.substring(0, space);
				firstname = fullValue.substring(space + 1);
			}
		}

		return new String[]{firstname, surname};
	}


	/**
	 * Scores this name against another standardized Chinese person name.
	 *
	 * <p>Edit distances are computed separately for surname and first name,
	 * over both the character lists and the hex-code lists, and summed per
	 * representation. The smaller of the two sums is used; when the hex-code
	 * sum is the smaller one the search-mode score is scaled by 80 instead
	 * of 100.
	 *
	 * @param otherstd  the other name; must be a {@code StandardChinesePersonName}
	 * @param anon      word list forwarded to the edit-distance routine
	 * @param cv        word list forwarded to the edit-distance routine
	 * @param isSearch  when true, a non-zero edit distance is reduced by the
	 *                  difference in token counts (never below zero) before scoring
	 * @param asContent not used by this implementation
	 * @return {@code scale * (divisor - edit) / divisor}, where {@code divisor}
	 *         is the sum of the generated gradients; 0.0 if that sum is zero
	 * @throws ClassCastException if {@code otherstd} is of a different type
	 */
	@Override
	public double compare(Standardized otherstd, WordList anon, WordList cv,
			 boolean isSearch, boolean asContent) {
		StandardChinesePersonName stdcpn=(StandardChinesePersonName)otherstd;
		double[] gradients = GradientGenerator.generate(Math.min(this.tokensize,stdcpn.tokensize), Math.max(this.tokensize,stdcpn.tokensize), 2.0, 0.0, 1.0, gradient);
		double divisor=GradientGenerator.sum(gradients);
		if (divisor == 0.0) {
			// No weight to normalise against; return 0 instead of NaN or an infinite score.
			return 0.0;
		}

		double sneditzh = EditDistance.getLevenshteinDistanceOfStringArrayList(surnamecharspy, stdcpn.surnamecharspy, anon, cv, gradients);
		double fneditzh = EditDistance.getLevenshteinDistanceOfStringArrayList(firstnamecharspy, stdcpn.firstnamecharspy, anon, cv, gradients);
		double snedit = EditDistance.getLevenshteinDistanceOfStringArrayList(surnamechars, stdcpn.surnamechars, anon, cv, gradients);
		double fnedit = EditDistance.getLevenshteinDistanceOfStringArrayList(firstnamechars, stdcpn.firstnamechars, anon, cv, gradients);

		// NOTE(review): an earlier comment said surnames should carry less weight
		// because they are common, but both parts are summed with equal weight here.
		double edit = fnedit + snedit;
		double editzh = fneditzh + sneditzh;
		double factor=100.0;
		if (edit>editzh){
			edit=editzh;
			factor=80.0;
		}

		if (isSearch) {
			// Forgive as many edits as there are missing tokens, but never go negative.
			if (edit>0)
				edit=edit-Math.abs(this.tokensize-stdcpn.tokensize);
			if (edit<0) edit=0;
			return factor*(divisor-edit)/divisor;
		}
		// Non-search scoring always uses the full 100 scale, regardless of factor.
		return 100*(divisor-edit)/divisor;
	}

}
