/**
 *
	Identiza - Fuzzy matching Libraries
    
    Copyright (C) 2019  Robert James Haynes (EntityStream KFT), Budapest Hungary

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see https://www.gnu.org/licenses/agpl-3.0.en.html
 */
package com.entitystream.identiza.entity.resolve.utils;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Random;
import java.util.Set;

import com.entitystream.identiza.wordlist.WordList;

/**
 * MinHash estimation of the Jaccard similarity between two sets.
 *
 * <p>The class offers two independent entry points:
 * <ul>
 *   <li>{@link #similarity(Set, Set)} compares two sets of arbitrary elements directly,
 *       using the {@code numHash} seeded hash functions created by the constructor;</li>
 *   <li>{@link #getSignature(Set)} produces a fixed-length signature for a set of strings,
 *       using the random offsets drawn by {@link #initRandom()}.</li>
 * </ul>
 *
 * @param <T> element type of the sets passed to {@link #similarity(Set, Set)}; only
 *            {@code hashCode()} of the elements is used, so distinct elements with the
 *            same hash code are indistinguishable to the estimator
 */
public class MinHash<T> {

	/**
	 * Number of consecutive tokens joined into one shingle by {@link #shingle(String[])}.
	 * The previous value of 0 made every shingle the empty string.
	 */
	private static final int NGRAMS = 3;

	/** Number of min-hash values in a signature produced by {@link #getSignature(Set)}. */
	private static final int SIGNATURE_SIZE = 200;

	/** One seed per hash function used by {@link #similarity(Set, Set)}. */
	private final int[] seeds;

	/** Number of hash functions used by {@link #similarity(Set, Set)}. */
	private final int numHash;

	/** Source of the per-slot offsets drawn by {@link #initRandom()}; deliberately unseeded. */
	Random r = new Random();

	/** Right shift applied in {@link #getMinHash(Set, int)}; 0 until {@link #initRandom()} runs. */
	private int hashCodeSizeDiff;

	/** Per-slot additive offsets for {@link #getMinHash(Set, int)}; all zero until {@link #initRandom()} runs. */
	int[] randoms = new int[SIGNATURE_SIZE];

	/**
	 * Creates an estimator with {@code numHash} hash functions.
	 *
	 * <p>The seeds come from a {@link Random} with a fixed seed, so two instances built with
	 * the same {@code numHash} use identical hash functions and return identical results.
	 *
	 * @param numHash number of hash functions; more functions give a finer-grained estimate
	 * @throws IllegalArgumentException if {@code numHash} is negative
	 */
	public MinHash(int numHash){
		if (numHash < 0) {
			throw new IllegalArgumentException("numHash must not be negative: " + numHash);
		}
		this.numHash = numHash;
		seeds = new int[numHash];

		Random seedSource = new Random(11);
		for (int i = 0; i < numHash; i++){
			seeds[i] = seedSource.nextInt();
		}
	}

	/**
	 * Estimates the Jaccard similarity of two sets as the fraction of hash functions whose
	 * minimum value over {@code set1} equals their minimum value over {@code set2}.
	 *
	 * <p>Two empty sets are reported as identical (1.0); an empty set compared with a
	 * non-empty one yields 0.0. If the instance was created with zero hash functions and
	 * both sets are non-empty there is nothing to compare and the result is {@code NaN}.
	 *
	 * @param set1 first set, must not be {@code null}
	 * @param set2 second set, must not be {@code null}
	 * @return the estimated similarity in the range [0, 1]
	 */
	public double similarity(Set<T> set1, Set<T> set2){
		if (set1.isEmpty() || set2.isEmpty()) {
			return (set1.isEmpty() && set2.isEmpty()) ? 1.0 : 0.0;
		}

		int[] signature1 = computeMinHashForSet(set1);
		int[] signature2 = computeMinHashForSet(set2);

		return computeSimilarityFromSignatures(signature1, signature2);
	}

	/**
	 * Computes the min-hash signature of a set: slot {@code i} holds the smallest value
	 * that hash function {@code i} produces over the elements of the set.
	 *
	 * @param set the set to summarise
	 * @return an array of {@code numHash} minima; slots stay at {@link Integer#MAX_VALUE}
	 *         when the set is empty
	 */
	private int[] computeMinHashForSet(Set<T> set){
		int[] signature = new int[numHash];
		for (int i = 0; i < numHash; i++) {
			signature[i] = Integer.MAX_VALUE;
		}

		for (T element : set) {
			// a null element is hashed as 0 so that sets containing null remain usable
			int elementHash = (element == null) ? 0 : element.hashCode();
			for (int i = 0; i < numHash; i++) {
				int value = hash(elementHash, seeds[i]);
				if (value < signature[i]) {
					signature[i] = value;
				}
			}
		}
		return signature;
	}

	/**
	 * Returns the fraction of positions at which two signatures hold the same value.
	 *
	 * @param minHashValues1 first signature
	 * @param minHashValues2 second signature, must be as long as the first
	 * @return matching positions divided by the signature length ({@code NaN} for length 0)
	 * @throws IllegalArgumentException if the signatures differ in length
	 */
	private static double computeSimilarityFromSignatures(int[] minHashValues1, int[] minHashValues2) {
		if (minHashValues1.length != minHashValues2.length) {
			throw new IllegalArgumentException("signature lengths differ: "
					+ minHashValues1.length + " vs " + minHashValues2.length);
		}
		int identicalMinHashes = 0;
		for (int i = 0; i < minHashValues1.length; i++){
			if (minHashValues1[i] == minHashValues2[i]) {
				identicalMinHashes++;
			}
		}
		return (1.0 * identicalMinHashes) / minHashValues1.length;
	}

	/**
	 * Seeded 32-bit hash: XORs the seed into {@code x} and applies the MurmurHash3 32-bit
	 * finalizer. Every step is invertible, so for a fixed seed two different inputs never
	 * produce the same output.
	 *
	 * @param x    value to hash (an element's {@code hashCode()})
	 * @param seed seed selecting the hash function
	 * @return the mixed hash value
	 */
	private static int hash(int x, int seed) {
		int h = x ^ seed;
		h ^= h >>> 16;
		h *= 0x85ebca6b;
		h ^= h >>> 13;
		h *= 0xc2b2ae35;
		h ^= h >>> 16;
		return h;
	}

	/**
	 * Builds a membership table over the union of two sets.
	 *
	 * @param set1 first set
	 * @param set2 second set
	 * @return a map from every element of the union to a two-element array whose index 0
	 *         tells whether the element is in {@code set1} and index 1 whether it is in
	 *         {@code set2}
	 */
	public Map<T,boolean[]> buildBitMap(Set<T> set1, Set<T> set2){

		Map<T,boolean[]> bitArray = new HashMap<T,boolean[]>();

		for(T t : set1){
			bitArray.put(t, new boolean[]{true,false});
		}

		for(T t : set2){
			// already registered means the element is also in set1; otherwise it is set2-only
			bitArray.put(t, new boolean[]{bitArray.containsKey(t), true});
		}

		return bitArray;
	}

	/**
	 * Draws fresh random offsets for the signature slots and enables the right shift that
	 * keeps only the upper half of each 32-bit hash in {@link #getMinHash(Set, int)}.
	 *
	 * <p>The offsets come from an unseeded {@link Random}, so signatures are only comparable
	 * when they were produced by the same instance after the same call to this method.
	 */
	public void initRandom(){
		int machineWordSize = Integer.SIZE;
		int hashCodeSize = machineWordSize/2;
		// assign the field; a local of the same name used to shadow it, leaving the shift at 0
		hashCodeSizeDiff = machineWordSize - hashCodeSize;
		for (int i=0; i<randoms.length; i++){
			randoms[i]=r.nextInt();
		}
	}

	/**
	 * Computes the min-hash of a set of strings for one signature slot. Each string's
	 * {@code hashCode()} is multiplied by the odd number {@code 2*hcr+1}, offset by
	 * {@code randoms[hcr]} and arithmetically shifted right by the configured amount; the
	 * smallest result is returned.
	 *
	 * @param set strings to hash
	 * @param hcr slot index, must be a valid index into {@code randoms}
	 * @return the smallest hash over the set, or {@link Integer#MAX_VALUE} for an empty set
	 */
	public int getMinHash(Set<String> set, int hcr){
		int hashCode=Integer.MAX_VALUE;
		for (String s: set){
			int hstart=s.hashCode();
			int hc = (((hstart * (hcr*2+1)) + randoms[hcr]) >> hashCodeSizeDiff);
			if (hc < hashCode)
				hashCode=hc;
		}
		return hashCode;
	}

	/**
	 * Computes the full signature of a set of strings by evaluating
	 * {@link #getMinHash(Set, int)} for every slot.
	 *
	 * @param set strings to summarise
	 * @return an array of 200 min-hash values, one per slot
	 */
	public int[] getSignature(Set<String> set){
		int[] sign = new int[SIGNATURE_SIZE];
		for (int i=0;i<SIGNATURE_SIZE;i++){
			sign[i]=getMinHash(set, i);
		}
		return sign;
	}

	/**
	 * Builds the set of word shingles of the default size (3 tokens) from a token array.
	 *
	 * @param list tokens in document order
	 * @return the distinct shingles; empty when there are fewer than 3 tokens
	 * @see #shingle(String[], int)
	 */
	public static Set<String> shingle(String [] list){
		return shingle(list, NGRAMS);
	}

	/**
	 * Builds the set of word shingles from a token array: every run of {@code ngramSize}
	 * consecutive tokens, joined by a single space so that token boundaries stay
	 * unambiguous.
	 *
	 * @param list      tokens in document order
	 * @param ngramSize number of tokens per shingle, at least 1
	 * @return the distinct shingles; empty when there are fewer than {@code ngramSize} tokens
	 * @throws IllegalArgumentException if {@code ngramSize} is less than 1
	 */
	public static Set<String> shingle(String [] list, int ngramSize){
		if (ngramSize < 1) {
			throw new IllegalArgumentException("ngramSize must be at least 1: " + ngramSize);
		}
		Set<String> shingles = new HashSet<String>();
		// inclusive upper bound so the window ending on the last token is not dropped
		for (int start=0; start<=list.length-ngramSize; start++){
			StringBuilder sb = new StringBuilder();
			for (int j=start; j<start+ngramSize; j++){
				if (j > start) {
					sb.append(' ');
				}
				sb.append(list[j]);
			}
			shingles.add(sb.toString());
		}
		return shingles;
	}

	/**
	 * Small demonstration: shingles two similar texts and prints the similarity estimated
	 * by both APIs.
	 *
	 * @param args unused
	 */
	public static void main(String[] args){
		String one = "Thank you for registering with my easyJet. Welcome, we look forward to seeing you on board soon and getting you set for takeoff. Keep an eye on your inbox too as we'll be dropping a quick hello over the next few days with some of our latest, greatest fares. Or visit us online now at easyjet.com Now that you have registered you will be able to manage all your easyJet.com online bookings. Here's what your membership profile currently looks like";
		Set<String> set1 = shingle(WordList.split(one));
		String two = "Thank you for registering with my telstra. Welcome, we look forward to seeing you online soon and getting you set with my tv. Keep an eye on your inbox too as we'll be dropping a quick hello over the next few days with some of our latest, greatest fares. Or visit us online now at easyjet.com Now that you have registered you will be able to manage all your telstra services. Here's what your membership profile currently looks like";
		Set<String> set2 = shingle(WordList.split(two));

		MinHash<String> minHash = new MinHash<String>(set1.size()+set2.size());
		System.out.println(minHash.similarity(set1, set2));

		minHash.initRandom();
		int[] ones = minHash.getSignature(set1);
		int[] twos = minHash.getSignature(set2);
		System.out.println(computeSimilarityFromSignatures(ones, twos));
	}
}