/**
 *
	Identiza - Fuzzy matching Libraries
    
    Copyright (C) 2019  Robert James Haynes (EntityStream KFT), Budapest Hungary

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see https://www.gnu.org/licenses/agpl-3.0.en.html
 */
package com.entitystream.identiza.entity.resolve.types;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import com.entitystream.monster.db.Document;

import com.entitystream.identiza.entity.resolve.processing.GradientGenerator;
import com.entitystream.identiza.entity.resolve.processing.IdentizaException;
import com.entitystream.identiza.entity.resolve.utils.NYSIIS;
import com.entitystream.identiza.metadata.IdentizaSettings;
import com.entitystream.identiza.wordlist.RuleSet;
import com.entitystream.identiza.wordlist.WordList;

/**
 * Matcher for company names.
 *
 * <p>Key generation ({@link #getKeys(Object, boolean, boolean)}) splits a name into
 * tokens with {@link WordList#split}, encodes the useful tokens with
 * {@link NYSIIS#encode} and emits concatenations of the leading encoded tokens.
 * Standardisation produces a {@link StandardCompanyName}; scoring is delegated
 * unchanged to {@link MatchBase}.
 */
public class MatchCompanyName extends MatchBase implements Serializable{

	private static final long serialVersionUID = 9019823593955645630L;

	/**
	 * Creates a matcher, passing all arguments straight to the superclass.
	 *
	 * @param name     forwarded to {@code MatchBase}
	 * @param minwidth forwarded to {@code MatchBase}
	 * @param maxwidth forwarded to {@code MatchBase}
	 * @throws Exception if the superclass constructor fails
	 */
	public MatchCompanyName(String name, int minwidth, int maxwidth) throws Exception{
		super(name, minwidth, maxwidth);
	}

	/**
	 * Creates a matcher registered under the {@code "MATCHCOMPANYNAME"} tag and caps
	 * {@code maxScore} at {@link MatchProcInterface#MATCH_VCLOSE}.
	 *
	 * @param name     forwarded to {@code MatchBase}
	 * @param minwidth forwarded to {@code MatchBase}
	 * @param maxwidth forwarded to {@code MatchBase}
	 * @param gradient NOTE(review): accepted but never read by this constructor;
	 *                 confirm whether it was meant to initialise the gradient
	 * @throws Exception if the superclass constructor fails
	 */
	public MatchCompanyName(String name, int minwidth, int maxwidth, int gradient)  throws Exception{
		super(name, minwidth, maxwidth, "MATCHCOMPANYNAME");
		maxScore = MatchProcInterface.MATCH_VCLOSE;
	}

	/**
	 * Creates a matcher with only a name, leaving every other setting to the superclass.
	 *
	 * @param name forwarded to {@code MatchBase}
	 */
	public MatchCompanyName(String name) {
		super(name);
	}

	/**
	 * Scores two standardised values; this class adds nothing to the inherited scoring.
	 *
	 * @return whatever {@code MatchBase.calculateComparisonScore} returns for the same arguments
	 */
	@Override
	public double calculateComparisonScore(Standardized stdBase, Standardized stdComp, boolean isSearch, boolean asContent){
		return super.calculateComparisonScore(stdBase, stdComp, isSearch, asContent);
	}

	/**
	 * Wraps the given words in a {@link StandardCompanyName} built with this matcher's
	 * anonymous-word rules, lookup rules, gradient and rule function.
	 *
	 * @param originalText not used by this implementation
	 * @param words        the tokens of the name
	 * @return a new {@link StandardCompanyName}
	 */
	@Override
	public Standardized standardise(String originalText, String[] words){
		return new StandardCompanyName(words, this.getRuleAnon(), this.getRuleLookup(), gradient,ruleFunction);
	}

	/**
	 * Generates keys in non-search mode; equivalent to {@code getKeys(value, batch, false)}.
	 */
	@Override
	public Collection<String> getKeys(Object value, boolean batch){
		return getKeys(value, batch, false);
	}

	/**
	 * Generates the set of keys for a company name.
	 *
	 * <p>All non-null parts of {@code value} are joined with single spaces and trimmed.
	 * A joined name of three characters or fewer becomes a single upper-cased key.
	 * Longer names are tokenised; every token that the key word list reports as useful
	 * contributes either a phonetic code (separated by {@code ':'} while building) or,
	 * for tokens whose {@link WordList#getNumber} form differs from the token itself,
	 * a suffix that is appended to every emitted key. Keys are the concatenation of the
	 * first {@code t} phonetic codes for {@code t} up to {@code maxWidth}, emitted when
	 * {@code t >= minWidth} or when the name has fewer codes than {@code minWidth} and
	 * all of them have been consumed. In search mode the same is repeated with the
	 * first code skipped.
	 *
	 * @param value    a {@code String} or {@code String[]}; any other non-null type fails the
	 *                 {@code String[]} cast and {@code null} fails on dereference
	 * @param batch    not used by this implementation
	 * @param isSearch when {@code true}, also emits the keys that skip the first code
	 * @return the generated keys; contains a single empty string when nothing was
	 *         generated and this matcher is not mandatory
	 */
	@Override
	public Collection<String> getKeys(Object value, boolean batch,boolean isSearch){
		String[] nameParts = (value instanceof String) ? new String[]{(String)value} : (String[])value;
		Set<String> ret = new HashSet<String>();

		// Every part is treated as a piece of one name: join the non-null ones.
		StringBuilder joined = new StringBuilder();
		for (String namePart : nameParts){
			if (namePart != null){
				joined.append(' ').append(namePart);
			}
		}
		String fullValue = joined.toString().trim();

		if (fullValue.length() <= 3) {
			// Short names are used verbatim (upper-cased) as their own key.
			ret.add(fullValue.toUpperCase());
		} else {
			// NOTE(review): names longer than 255 chars are cut to 254, not 255;
			// kept as-is so that previously generated keys stay stable.
			if (fullValue.length() > 255){
				fullValue = fullValue.substring(0, 254);
			}

			StringBuilder keyBuilder = new StringBuilder();
			StringBuilder mandBuilder = new StringBuilder();
			// compressInitials merges separated initials before encoding.
			for (String oword : compressInitials(WordList.split(fullValue))){
				String word = WordList.clean(oword);
				if (word == null || word.trim().length() == 0){
					continue;
				}
				if (getKeyAnon() == null || !getKeyAnon().isUsefulForComparison(word, 0)){
					continue;
				}

				if (word.length() > 1){
					if (!WordList.containsAnyNumbers(oword)){
						// Plain word: keep its phonetic code unless it is a single character.
						String enc = getKey(word);
						if (enc.length() > 1){
							keyBuilder.append(enc).append(':');
						}
					} else {
						String enc = WordList.getNumber(oword).toString();
						if (enc.length() > 0){
							if (!enc.equalsIgnoreCase(oword)){
								mandBuilder.append(enc.trim().toUpperCase());
							} else {
								// NOTE(review): unlike the other branches no ':' separator is
								// appended here, so this code fuses with the following one.
								keyBuilder.append(getKey(word));
							}
						}
					}
				} else {
					// Single-character token.
					String enc = WordList.getNumber(oword).toString();
					if (!enc.equalsIgnoreCase(oword)){
						mandBuilder.append(enc.toUpperCase().trim());
					} else {
						keyBuilder.append(enc.toUpperCase()).append(':');
					}
				}
			}

			String key = keyBuilder.toString();
			String mandkey = mandBuilder.toString();
			if (key.length() + mandkey.length() > 2) {
				if (key.startsWith(":")){
					key = key.substring(1);
				}
				if (mandkey.startsWith(":")){
					// Fixed: this used to assign key.substring(1), replacing the suffix
					// with the phonetic key instead of stripping the suffix's own colon.
					mandkey = mandkey.substring(1);
				}
				String[] combs = key.split(":");

				addCompanyKeyCombinations(ret, combs, 0, mandkey);
				if (isSearch){
					// Search additionally tries the name with its first code dropped.
					addCompanyKeyCombinations(ret, combs, 1, mandkey);
				}
			}
		}

		if (!isMandatory && ret.size() == 0){
			ret.add("");
		}
		return ret;
	}

	/**
	 * Adds to {@code keys} the concatenations of {@code codes[offset .. offset+t-1]} for
	 * {@code t = 1 .. maxWidth}, each followed by {@code suffix}.
	 *
	 * <p>A concatenation is emitted when {@code t >= minWidth}, or when there are fewer
	 * codes than {@code minWidth} and {@code t} equals the number of codes.
	 * NOTE(review): with {@code offset == 1} the second condition can never hold
	 * (because {@code t < codes.length}); this mirrors the original behaviour.
	 */
	private void addCompanyKeyCombinations(Set<String> keys, String[] codes, int offset, String suffix){
		boolean fewerThanMin = codes.length < minWidth;
		StringBuilder part = new StringBuilder();
		for (int t = 1; t <= maxWidth && t + offset <= codes.length; t++){
			part.append(codes[t - 1 + offset]);
			if (t >= minWidth || (fewerThanMin && t == codes.length)){
				keys.add(part.toString() + suffix);
			}
		}
	}

	/**
	 * Encodes a single word: upper-cases and trims it, applies the clean-up pattern and
	 * passes the result to {@link NYSIIS#encode}.
	 *
	 * <p>NOTE(review): the pattern matches a whitespace character, or a period followed
	 * by one or more commas; it does not remove a lone period or comma. Confirm whether
	 * a character class was intended.
	 *
	 * @param word the word to encode; must not be {@code null}
	 * @return the encoded form of {@code word}
	 */
	@Override
	public String getKey(String word){
		String normalised = word.toUpperCase().trim().replaceAll("\\s|\\.,+", "");
		return NYSIIS.encode(normalised);
	}

	/**
	 * Manual smoke test: prints the keys generated for three spellings of the same
	 * name (initials separated by spaces, by dot-space, and by dots only).
	 */
	public static void main(String[] args){
		MatchCompanyName mcn = new MatchCompanyName("X");
		mcn.minWidth=1;
		mcn.maxWidth=3;
		mcn.setRuleSet(RuleSet.emptyRuleSet());

		String[] samples = {
				"Α Ε Β Ε ΚΟΝΤΟΒΕΡΟΣ ",
				"Α. Ε. Β. Ε. ΚΟΝΤΟΒΕΡΟΣ ",
				"Α.Ε.Β.Ε. ΚΟΝΤΟΒΕΡΟΣ "
		};
		for (String sample : samples){
			for (String k : mcn.getKeys(sample, false, false)){
				System.out.println(k);
			}
		}
	}
}
