/**
 *
	Identiza - Fuzzy matching Libraries
    
    Copyright (C) 2019  Robert James Haynes (EntityStream KFT), Budapest Hungary

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see https://www.gnu.org/licenses/agpl-3.0.en.html
 */
package com.entitystream.identiza.entity.resolve.types;

import java.io.Serializable;
import java.util.Collection;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.logging.Logger;

import com.entitystream.identiza.entity.resolve.utils.EditDistance;
import com.entitystream.identiza.entity.resolve.utils.NYSIIS;
import com.entitystream.identiza.wordlist.WordList;

/**
 * Match type for postal addresses.
 *
 * <p>Standardisation and scoring are delegated to {@code StandardAddress} and
 * {@code Standardized}. This class mainly contributes {@link #getKeys(Object, boolean)},
 * which derives lookup keys from the PO box / street number / unit number, the street
 * name words and the remaining (city) words of an address.
 */
public class MatchAddress extends MatchBase implements Serializable{
	// Static on purpose: java.util.logging.Logger is not Serializable, so keeping it as
	// instance state would make default serialization of this Serializable class fail.
	static final Logger logger = Logger.getLogger("com.identiza");

	private static final String DIGITS = "0123456789";

	/**
	 * Upper bound on how many keys a single "from-to" street number range may expand to.
	 * Larger ranges are not expanded; the literal range text is used as the number instead.
	 * The value is an arbitrary safety limit, not a domain rule.
	 */
	private static final int MAX_RANGE_EXPANSION = 10000;

	/** Tokens that introduce a PO box number. */
	private static final String[] POBOX_TOKENS = {"BOX", "POBOX", "PO", "PBOX"};

	/**
	 * Tokens treated as street types.
	 * NOTE(review): "CULS-DE-SAC " carries a trailing space and so probably never matches a
	 * token; it is reproduced unchanged here - confirm before correcting it.
	 */
	private static final String[] STREET_TYPE_TOKENS = {
		"ALLEY", "ALY", "APR", "APPROACH", "AUTOBAHN", "AUTO-ESTRADA", "AUTOROUTE", "AUTOSTRADA", "AUTOSTRASSE",
		"AVE", "AVENUE", "BAY", "BLVD", "BOULEVARD", "BYP", "BYPASS", "BYWAY", "CANYON", "CAUSEWAY", "CIR", "CIRC",
		"CIRCLE", "CIRCUS", "CLOSE", "COURT", "COVE", "CRES", "CRESCENT", "CROFT", "CSWY", "CT", "CULS-DE-SAC ",
		"CV", "CYN", "DELL", "DR", "DRIVE", "ESP", "ESPLANADE", "EXPRESSWAY", "EXPY", "FREEWAY", "FRONTAGE", "FWY",
		"GARDENS", "GARTH", "GATE", "GDNS", "GRADE", "GREEN", "GRN", "GROVE", "GRV", "HEIGHTS", "HIGHLANDS",
		"HIGHWAY", "HILL", "HL", "HTS", "HWY", "KEY", "KNL", "KNOLL", "LANE", "LAWN", "LN", "LOOP", "MANOR", "MEWS",
		"MNR", "MOTORWAY", "MOUNT", "MT", "MTRWY", "MWAY", "NENE", "NOOK", "OVAL", "PARADE", "PARK", "PARKWAY",
		"PASSAGE", "PATHWAY", "PDE", "PIKE", "PKWY", "PL", "PLACE", "PLAZA", "PLZ", "PROM", "PROMENADE", "PSGE",
		"PTHWY", "QUADRANT", "QUAY", "RD", "RISE", "ROAD", "ROW", "SPUR", "SQ", "SQUARE", "ST", "STRA", "STRAVENUE",
		"STREET", "TCE", "TER", "TERRACE", "TPK", "TRACE", "TRAIL", "TRCE", "TRL", "TURNPIKE", "VALE", "VIEW", "VW",
		"WALK", "WAY"};

	/** Tokens that introduce a flat / unit number. */
	private static final String[] UNIT_TOKENS = {"FLAT", "FL", "UNIT", "U", "APT", "APARTMENT", "SHOP", "BUILDING", "BLDG", "SUITE", "SUITES","/"};

	/** Creates the match type; all arguments are passed straight to the superclass. */
	public MatchAddress(String name, int minwidth, int maxwidth) throws Exception{
		super(name, minwidth, maxwidth);
	}

	/**
	 * Creates the match type with the purpose literal "ADDRESS".
	 *
	 * <p>{@code start}, {@code end}, {@code gradient} and {@code keyField} are accepted but not
	 * used by this constructor: only {@code name}, {@code minwidth} and {@code maxwidth} are
	 * passed on to the superclass.
	 */
	public MatchAddress(String name, int minwidth, int maxwidth, int start, int end,int gradient, String keyField) throws Exception {
		super(name, minwidth, maxwidth, "ADDRESS");
	}

	/** Creates the match type from its name only. */
	public MatchAddress(String name){
		super(name);
	}

	/** Scores two standardised addresses by delegating to {@code stdBase.compare}. */
	@Override
	public double calculateComparisonScore(Standardized stdBase, Standardized stdComp, boolean isSearch, boolean asContent){
		return stdBase.compare(stdComp, getRuleAnon(), getRuleCv(), isSearch, asContent);
	}

	/**
	 * Wraps the words in a {@code StandardAddress}. {@code originalText} is not used here.
	 */
	@Override
	public Standardized standardise(String originalText,String[] words){
		return new StandardAddress(words, this.getRuleAnon(), this.getRuleLookup(), gradient,ruleFunction);
	}

	/**
	 * Builds the lookup keys for an address.
	 *
	 * <p>The value parts are concatenated, split into words with {@code WordList.split} and
	 * then handled by exactly one of three strategies:
	 * <ol>
	 * <li>a PO box token is present: keys come from "PO" + box number + city words;</li>
	 * <li>otherwise, a street-type token is present: keys are street number (and unit
	 *     number, if any) + street name words + city words;</li>
	 * <li>otherwise: keys are street number (and unit number, if any) + all remaining words.</li>
	 * </ol>
	 *
	 * @param value a {@code String} or {@code String[]} (null elements are skipped); a
	 *              {@code null} value yields no keys
	 * @param batch not used by this implementation
	 * @return the set of generated keys, possibly empty
	 * @throws ClassCastException if {@code value} is neither a {@code String} nor a {@code String[]}
	 */
	@Override
	public Collection<String> getKeys(Object value, boolean batch){
		Set<String> keys = new HashSet<String>();
		if (value == null)
			return keys;

		String[] words = WordList.split(joinParts(value));

		int poboxpos = EditDistance.findToken(words, POBOX_TOKENS);
		int streetTypePos = EditDistance.findToken(words, STREET_TYPE_TOKENS);
		int flatpos = EditDistance.findToken(words, UNIT_TOKENS);

		if (poboxpos>-1)
			addPoBoxKeys(keys, words, poboxpos, streetTypePos);
		else if (streetTypePos>-1)
			addStreetKeys(keys, words, streetTypePos, flatpos);
		else
			addUntypedKeys(keys, words, flatpos);
		return keys;
	}

	/** Concatenates the non-null parts of a String / String[] value with single spaces, trimmed. */
	private static String joinParts(Object value){
		String[] parts = (value instanceof String) ? new String[]{(String)value} : (String[])value;
		StringBuilder joined = new StringBuilder();
		for (String part : parts){
			if (part!=null)
				joined.append(' ').append(part);
		}
		return joined.toString().trim();
	}

	/** True when the word is non-empty and its first character is an ASCII digit. */
	private static boolean startsWithDigit(String word){
		return word!=null && !word.isEmpty() && DIGITS.indexOf(word.charAt(0))>=0;
	}

	/** Applies {@link #getKey(String)} to words[from..to) and joins the results with the separator. */
	private String joinKeys(String[] words, int from, int to, String separator){
		StringBuilder joined = new StringBuilder();
		for (int i=from; i<to; i++){
			if (i>from)
				joined.append(separator);
			joined.append(getKey(words[i]));
		}
		return joined.toString();
	}

	/** PO box strategy: uses the first numeric word after the PO box token as the box number. */
	private void addPoBoxKeys(Set<String> keys, String[] words, int poboxpos, int streetTypePos){
		for (int i=poboxpos+1; i<words.length; i++){
			if (!startsWithDigit(words[i]))
				continue;
			String city = "";
			if (words.length>i+1){ //is there anything left after the box number?
				// NOTE(review): the city words start after the street-type token, so when there
				// is no such token (streetTypePos == -1) they start at word 0 and include the
				// PO box token and number themselves. This looks unintended, but changing it
				// would change the keys produced for existing data, so it is kept as is.
				city = joinKeys(words, streetTypePos+1, words.length, ":");
			}
			keys.addAll(allCombs("PO" + words[i] + city, partialKeys, false, minWidth, maxWidth));
			return; // only the first numeric word is considered
		}
		// no numeric word after the PO box token: no keys are produced
	}

	/** Street strategy: number words sit before the street-type token, city words after it. */
	private void addStreetKeys(Set<String> keys, String[] words, int streetTypePos, int flatpos){
		NumberScan scan = scanNumbers(words, flatpos, streetTypePos, false);

		//street name sits between the streetnumber and the street type
		String street = joinKeys(words, scan.streetNumberPos+1, streetTypePos, ":");
		//everything after the street type is used as the city (no separator between words)
		String city = joinKeys(words, streetTypePos+1, words.length, "");

		addNumberKeys(keys, scan.streetNumber, street + city);
		if (scan.unitNumber.length()>0)
			keys.add(scan.unitNumber + street + city);
	}

	/** Fallback strategy when there is no street-type token: everything after the number is used. */
	private void addUntypedKeys(Set<String> keys, String[] words, int flatpos){
		// words are trimmed before the digit test in this branch only, as in the original code
		NumberScan scan = scanNumbers(words, flatpos, words.length, true);

		//if there were no numbers then all words are taken
		String streetAndCity = joinKeys(words, scan.streetNumberPos+1, words.length, ":");

		addNumberKeys(keys, scan.streetNumber, streetAndCity);
		if (scan.unitNumber.length()>0)
			keys.add(scan.unitNumber + streetAndCity);
	}

	/** Result of {@link #scanNumbers}: the street/unit numbers found and where the street number was. */
	private static final class NumberScan {
		String streetNumber = "";
		String unitNumber = "";
		int streetNumberPos = -1;
	}

	/**
	 * Finds the unit number (the word following the flat token, if any) and the street number
	 * (the first word starting with a digit after the unit number and before {@code limit}).
	 * A street number written as "a/b" or "a\b" is split: the first part becomes the street
	 * number and the second part the unit number.
	 */
	private static NumberScan scanNumbers(String[] words, int flatpos, int limit, boolean trimBeforeTest){
		NumberScan scan = new NumberScan();
		int first = 0;
		if (flatpos>-1 && words.length>flatpos+1){
			scan.unitNumber = words[flatpos+1];
			first = flatpos+2; // skip the flat token and its number
		}
		for (int i=first; i<limit; i++){
			String candidate = words[i];
			if (candidate!=null && trimBeforeTest)
				candidate = candidate.trim();
			if (!startsWithDigit(candidate))
				continue;
			scan.streetNumberPos = i;
			if (words[i].contains("\\") || words[i].contains("/")){
				// character class: split on either a backslash or a forward slash
				String[] numberparts = words[i].split("[\\\\/]");
				scan.streetNumber = numberparts[0];
				if (numberparts.length>1) // e.g. "20/" has no second part
					scan.unitNumber = numberparts[1];
			}
			else
				scan.streetNumber = words[i];
			break;
		}
		return scan;
	}

	/**
	 * Adds the key(s) for a street number followed by {@code suffix}. A "from-to" number is
	 * expanded into one key per number in the range; if the bounds cannot be parsed, or the
	 * range spans {@link #MAX_RANGE_EXPANSION} numbers or more, the literal text is used instead.
	 * A range whose start is greater than its end adds nothing.
	 */
	private static void addNumberKeys(Set<String> keys, String streetnumber, String suffix){
		if (streetnumber.contains("-")){
			String[] numbers = streetnumber.split("-");
			if (numbers.length>1){
				try{
					int startNum = EditDistance.niceParseInt(numbers[0]);
					int endNum = EditDistance.niceParseInt(numbers[numbers.length-1]);
					if ((long)endNum - startNum < MAX_RANGE_EXPANSION){
						// long counter: an int counter would never terminate for endNum == Integer.MAX_VALUE
						for (long n=startNum; n<=endNum; n++)
							keys.add(n + suffix);
						return;
					}
				} catch (NumberFormatException e){
					// not a numeric range after all: fall through to the literal key
				}
			}
		}
		keys.add(streetnumber + suffix);
	}

	/**
	 * Converts one word into its key fragment.
	 *
	 * <p>Words starting with a digit are returned unchanged. Other words are NYSIIS-encoded
	 * (after upper-casing and removing everything outside A-Z / 0-9) when they are considered
	 * useful: without a key anonymiser that means longer than 3 characters and not the text
	 * "null"; with one, its {@code isUsefulForComparison(word, 2)} decides. All other words,
	 * including null and empty ones, give the empty string.
	 */
	@Override
	public String getKey(String word){
		if (word==null || word.isEmpty())
			return "";
		if (DIGITS.indexOf(word.charAt(0))>=0)
			return word;

		boolean useful;
		if (getKeyAnon()==null)
			useful = !word.equals("null") && word.length()>3;
		else
			useful = getKeyAnon().isUsefulForComparison(word,2);
		if (!useful)
			return "";

		// Locale.ROOT keeps the A-Z filter reliable whatever the default locale is
		return NYSIIS.encode(word.toUpperCase(Locale.ROOT).trim().replaceAll("[^A-Z0-9]", ""));
	}


	/** Manual smoke test: prints the keys generated for a sample address. */
	public static void main(String[] args){
		MatchAddress ma = new MatchAddress("");
		ma.minWidth=2;
		ma.maxWidth=2;
		System.out.println(ma.getKeys("Unit 20 1 Daly St DARWIN NT 800",  false));
		System.out.println(ma.getKeys("Unit 20 1 Daly St DARWIN NT 800",  false));

	}
}
