001/* 002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.tribuo.util.tokens.impl; 018 019import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance; 020import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl; 021import org.tribuo.util.tokens.Token; 022import org.tribuo.util.tokens.Tokenizer; 023 024/** 025 * This tokenizer is loosely based on the notion of word shape which is a common 026 * feature used in NLP. The idea here is that continuous runs of letters in the 027 * same character class will be grouped together. White space characters are 028 * used as delimiters. The character classes are: uppercase letters, lowercase 029 * letters, digits, and everything else goes into its own character class. So, 030 * for example, "1234abcd" would be split into "1234" and "abcd". And "!@#$" 031 * would result in four tokens. Please see unit tests. 032 * <p> 033 * Strings are split according to whitespace and contiguous runs of characters 034 * in the same character classes. Except for one exception - if uppercase 035 * letters are immediately followed by lowercase letters, then we keep them 036 * together. This has the effect of recognizing camel case and splits 037 * "CamelCase" into "Camel" and "Case". It also splits "ABCdef AAbb" into 038 * "ABCdef" and "AAbb". 039 */ 040public class ShapeTokenizer implements Tokenizer { 041 042 private String cs; 043 044 private int pos; 045 046 private String token; 047 048 private StringBuilder tb = new StringBuilder(); 049 050 private int start; 051 052 private int end; 053 054 private char currClass; 055 056 private int prevClass; 057 058 private boolean ready; 059 060 /** 061 * Constructs a ShapeTokenizer. 062 */ 063 public ShapeTokenizer() { } 064 065 @Override 066 public ConfiguredObjectProvenance getProvenance() { 067 return new ConfiguredObjectProvenanceImpl(this, "Tokenizer"); 068 } 069 070 @Override 071 public void reset(CharSequence cs) { 072 this.cs = cs.toString(); 073 pos = 0; 074 start = -1; 075 end = -1; 076 prevClass = -1; 077 token = null; 078 ready = false; 079 } 080 081 private char getClass(int cp) { 082 if (Character.isUpperCase(cp)) { 083 return 'A'; 084 } else if (Character.isLowerCase(cp)) { 085 return 'a'; 086 } else if (Character.isDigit(cp)) { 087 return '1'; 088 } else if (Character.isWhitespace(cp)) { 089 return ' '; 090 } else { 091 return (char) cp; 092 } 093 } 094 095 @Override 096 public boolean advance() { 097 if (cs == null) { 098 throw new IllegalStateException("ShapeTokenizer has not been reset."); 099 } 100 tb.delete(0, tb.length()); 101 start = pos; 102 while (pos < cs.length()) { 103 int cp = cs.codePointAt(pos); 104 int lcp = Character.charCount(cp); 105 106 currClass = getClass(cp); 107 108 // 109 // Skip spaces at the start of the token. 110 if (tb.length() == 0 && currClass == ' ') { 111 pos += lcp; 112 start = pos; 113 prevClass = currClass; 114 continue; 115 } 116 117 // 118 // When do we want to end the current token? When we cross a boundary 119 // between token classes when we're not at the start of the string, 120 // except when that boundary is between 121 // upper and lower case characters. 122 if (currClass != prevClass && prevClass != -1) { 123 if (!(prevClass == 'A' && currClass == 'a')) { 124 if (tb.length() > 0) { 125 token = tb.toString(); 126 prevClass = currClass; 127 // 128 // Note that we're not increasing pos here: we want 129 // to work on this current character the next time that 130 // we get called! 131 ready = true; 132 return true; 133 } 134 } 135 } 136 137 // 138 // We didn't end the token, so collect the current character, 139 // unless it's a space! 140 if (currClass != ' ') { 141 tb.appendCodePoint(cp); 142 } 143 prevClass = currClass; 144 pos += lcp; 145 end = pos; 146 } 147 148 if (tb.length() > 0) { 149 token = tb.toString(); 150 ready = true; 151 return true; 152 } 153 154 return false; 155 } 156 157 @Override 158 public String getText() { 159 if (ready) { 160 return token; 161 } else { 162 throw new IllegalStateException("ShapeTokenizer is not ready."); 163 } 164 } 165 166 @Override 167 public int getStart() { 168 if (ready) { 169 return start; 170 } else { 171 throw new IllegalStateException("ShapeTokenizer is not ready."); 172 } 173 } 174 175 @Override 176 public int getEnd() { 177 if (ready) { 178 return end; 179 } else { 180 throw new IllegalStateException("ShapeTokenizer is not ready."); 181 } 182 } 183 184 @Override 185 public Token.TokenType getType() { 186 if (ready) { 187 return Token.TokenType.WORD; 188 } else { 189 throw new IllegalStateException("ShapeTokenizer is not ready."); 190 } 191 } 192 193 @Override 194 public ShapeTokenizer clone() { 195 try { 196 ShapeTokenizer copy = (ShapeTokenizer) super.clone(); 197 copy.tb = new StringBuilder(); 198 copy.ready = false; 199 copy.cs = null; 200 return copy; 201 } catch (CloneNotSupportedException e) { 202 throw new AssertionError("ShapeTokenizer is Cloneable, but clone call failed"); 203 } 204 } 205 206}