
/*******************************************************************************
 * Copyright notice
 * 
 * This source code is copyright of Robert James Haynes - (c) 2010, 2011. All rights reserved.
 * 
 * Any redistribution, reproduction or decompilation of part or all of the code in any form is prohibited 
 * 
 * You may not, except with our express written permission, distribute or commercially exploit the content. Nor may you transmit it or store it in or display it on any website or other form of electronic retrieval system.
 ******************************************************************************/
/**
 *
	Identiza - Fuzzy matching Libraries
    
    Copyright (C) 2019  Robert James Haynes (EntityStream KFT), Budapest Hungary

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see https://www.gnu.org/licenses/agpl-3.0.en.html
 */
package com.entitystream.identiza.entity.resolve.types;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;

/**
 * Generates match keys for a record described by an array of fields,
 * e.g. <code>["MALE", "16-20", "WHITE", "SLIM"]</code>.
 * <p>
 * Each non-null field contributes a list of alternative values:
 * <ul>
 * <li>a numeric field (single number, <code>lo-hi</code> / <code>lo:hi</code> range,
 *     or <code>under N</code> / <code>over N</code>) is expanded into whole numbers;</li>
 * <li>any other field is expanded through <code>getAlternateWords</code>.</li>
 * </ul>
 * The per-field lists are then handed, in field order, to
 * <code>processKeyInOrder</code>, which builds the final keys.
 */
public class MatchSuspect extends MatchBase implements Serializable{

	/** A field whose first non-blank character is one of these is treated as numeric. */
	private static final String DIGITS = "0123456789";
	/** Prefix (case-insensitive) meaning "from the default lower bound up to N". */
	private static final String UNDER_PREFIX = "under";
	/** Prefix (case-insensitive) meaning "from N up to the default upper bound". */
	private static final String OVER_PREFIX = "over";
	/** Lower bound used when none is given or it cannot be parsed. */
	private static final double DEFAULT_LOWER = 0;
	/** Upper bound used when none is given or it cannot be parsed. */
	private static final double DEFAULT_UPPER = 100;
	/** Ranges whose upper bound exceeds this value are expanded in coarse steps. */
	private static final int COARSE_THRESHOLD = 25;
	/** Step used for ranges above {@link #COARSE_THRESHOLD}. */
	private static final int COARSE_STEP = 5;

	/**
	 * Creates a matcher with the given name and width limits and disables
	 * <code>partialKeys</code>.
	 *
	 * @param name     passed to the superclass constructor
	 * @param minwidth passed to the superclass constructor
	 * @param maxwidth passed to the superclass constructor
	 * @throws Exception declared by the superclass constructor
	 */
	public MatchSuspect(String name, int minwidth, int maxwidth) throws Exception{
		super(name, minwidth, maxwidth);
		partialKeys=false;
	}

	/**
	 * Creates a matcher of type <code>"SUSPECT"</code> and disables <code>partialKeys</code>.
	 * <p>
	 * NOTE: <code>start</code>, <code>end</code>, <code>gradient</code> and
	 * <code>keyField</code> are accepted but not used by this class.
	 *
	 * @param name     passed to the superclass constructor
	 * @param minwidth passed to the superclass constructor
	 * @param maxwidth passed to the superclass constructor
	 * @param start    ignored
	 * @param end      ignored
	 * @param gradient ignored
	 * @param keyField ignored
	 * @throws Exception declared by the superclass constructor
	 */
	public MatchSuspect (String name, int minwidth, int maxwidth, int start, int end,int gradient, String keyField)  throws Exception{
		super(name, minwidth, maxwidth,"SUSPECT");
		partialKeys=false;
	}

	/**
	 * Creates a matcher with the given name and disables <code>partialKeys</code>.
	 *
	 * @param name passed to the superclass constructor
	 */
	public MatchSuspect(String name){
		super(name);
		partialKeys=false;
	}

	/**
	 * Builds the keys for one record.
	 *
	 * @param value either a single <code>String</code> (treated as a one-field record),
	 *              a <code>String[]</code> with one entry per field, or <code>null</code>;
	 *              <code>null</code> entries in the array are skipped
	 * @param batch not used by this implementation
	 * @return the keys produced by <code>processKeyInOrder</code>, or <code>null</code>
	 *         when <code>value</code> is <code>null</code> or no field produced any alternative
	 * @throws ClassCastException if <code>value</code> is neither a <code>String</code>
	 *                            nor a <code>String[]</code>
	 */
	@Override
	public Collection<String> getKeys(Object value, boolean batch){
		if (value==null){
			// nothing to key on; same result as a record with no usable fields
			return null;
		}
		if (value instanceof String){
			value = new String[]{(String)value};
		}

		// one list of alternatives per usable field, in field order
		ArrayList<ArrayList<String>> perField = new ArrayList<ArrayList<String>>();
		for (String stringVal : (String[])value){
			if (stringVal==null){
				continue;
			}
			ArrayList<String> fieldAlternates;
			double[] range = parseNumericRange(stringVal);
			if (range!=null){
				fieldAlternates = expandRange(range[0], range[1]);
			} else {
				// not a number (this includes words such as OVERWEIGHT that merely
				// start with "over"/"under"): look up its alternate spellings
				fieldAlternates = new ArrayList<String>();
				fieldAlternates.addAll(getAlternateWords(stringVal, getKeyAnon(), getKeyCv()));
			}
			// fields that produced nothing are left out of the key entirely
			if (fieldAlternates.size()>0){
				perField.add(fieldAlternates);
			}
		}

		if (perField.size()>0){
			return processKeyInOrder(perField, 0, "", partialKeys);
		}
		return null;
	}

	/**
	 * Interprets a field as a numeric value or range.
	 * <p>
	 * Recognised forms: <code>under N</code>, <code>over N</code>, <code>lo-hi</code>,
	 * <code>lo:hi</code> and a single number. For fields that start with a digit,
	 * any bound that cannot be parsed keeps its default (0 for the lower bound,
	 * 100 for the upper bound). If either bound has a fractional part, both bounds
	 * are multiplied by 100 and rounded so that e.g. 1.6-2.0 becomes 160-200.
	 *
	 * @param field the raw, non-null field text
	 * @return <code>{lower, upper}</code>, or <code>null</code> when the field is not numeric
	 */
	private static double[] parseNumericRange(String field){
		String lowered = field.toLowerCase();
		double lower = DEFAULT_LOWER;
		double upper = DEFAULT_UPPER;

		if (lowered.startsWith(UNDER_PREFIX)){
			Double bound = parseBoundAfterPrefix(field, UNDER_PREFIX.length());
			if (bound==null){
				return null;
			}
			upper = bound.doubleValue();
		} else if (lowered.startsWith(OVER_PREFIX)){
			Double bound = parseBoundAfterPrefix(field, OVER_PREFIX.length());
			if (bound==null){
				return null;
			}
			lower = bound.doubleValue();
		} else {
			String trimmed = field.trim();
			// guard the empty case: a whitespace-only field has no first character
			if (trimmed.length()==0 || DIGITS.indexOf(trimmed.charAt(0))<0){
				return null;
			}
			if (field.contains("-") || field.contains(":")){
				// range: an unparseable or missing side keeps its default bound
				String[] split = field.split("\\-|:");
				if (split.length>0){
					try{
						lower = Double.parseDouble(split[0]);
					} catch (NumberFormatException ignored){
						// keep DEFAULT_LOWER
					}
				}
				if (split.length>1){
					try{
						upper = Double.parseDouble(split[1]);
					} catch (NumberFormatException ignored){
						// keep DEFAULT_UPPER
					}
				}
			} else {
				// sole number
				try{
					lower = Double.parseDouble(field);
					upper = lower;
				} catch (NumberFormatException ignored){
					// keep the default 0..100 range
				}
			}
		}

		// if the numbers are decimals then multiply them by 100 and integer them
		if (Math.round(lower)!=lower || Math.round(upper)!=upper){
			lower = Math.round(lower*100);
			upper = Math.round(upper*100);
		}
		return new double[]{lower, upper};
	}

	/**
	 * Parses the number that follows an <code>under</code>/<code>over</code> prefix.
	 * A single non-digit character directly after the prefix (typically a space)
	 * is skipped; a digit is kept so that <code>under18</code> reads as 18.
	 *
	 * @param field        the raw field text, known to start with the prefix
	 * @param prefixLength length of the prefix
	 * @return the parsed bound, or <code>null</code> if no number follows the prefix
	 */
	private static Double parseBoundAfterPrefix(String field, int prefixLength){
		String rest = field.substring(prefixLength);
		if (rest.length()>0 && DIGITS.indexOf(rest.charAt(0))<0){
			rest = rest.substring(1);
		}
		try{
			return Double.valueOf(rest.trim());
		} catch (NumberFormatException notANumber){
			return null;
		}
	}

	/**
	 * Expands a numeric range into whole-number strings.
	 * <p>
	 * Equal bounds give a single value. Otherwise values run from the rounded lower
	 * bound up to the upper bound, stepping by 1, or by 5 when the upper bound is
	 * above 25. A lower bound greater than the upper bound yields an empty list.
	 *
	 * @param lower lower bound
	 * @param upper upper bound
	 * @return the expanded values, possibly empty
	 */
	private static ArrayList<String> expandRange(double lower, double upper){
		ArrayList<String> values = new ArrayList<String>();
		if (lower!=upper){
			int increment = upper>COARSE_THRESHOLD ? COARSE_STEP : 1;
			for (int num=(int)Math.round(lower); num<=upper; num=num+increment){
				values.add(""+num);
			}
		} else {
			//only one number
			values.add(""+(int)Math.round(lower));
		}
		return values;
	}

	/**
	 * Normalises a single word: upper-cased, trimmed and with all spaces removed.
	 *
	 * @param word the word to normalise; <code>null</code> or empty becomes <code>"UNKNOWN"</code>
	 * @return the normalised word
	 */
	@Override
	public String getKey(String word) {
		if (word==null || word.length()==0)
			word="UNKNOWN";
		return word.toUpperCase().trim().replace(" ", "");

	}

	/**
	 * Manual smoke test: prints the keys generated for a sample record.
	 *
	 * @param args unused
	 */
	public static void main(String[] args){
		System.out.println("HELO");
		MatchSuspect susp = new MatchSuspect("");
		Collection<String> ret = susp.getKeys(new String[]{"MALE", "", "1.6-2.0", "", "WHITE EUROPEAN"}, false);
		// getKeys returns null when no keys could be generated
		if (ret!=null){
			for (String s : ret){
				System.out.println(s);
			}
		}
	}
}

