001/* 002 * Copyright (c) 2015-2021, Oracle and/or its affiliates. All rights reserved. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.tribuo.util.tokens.impl; 018 019import org.tribuo.util.tokens.Token; 020import org.tribuo.util.tokens.Token.TokenType; 021import org.tribuo.util.tokens.Tokenizer; 022 023/** 024 * This class supports character-by-character (that is, codepoint-by-codepoint) 025 * iteration over input text to create tokens. Extensions of this class are 026 * initialized with a {@link SplitFunction} which will be called for each character and 027 * a {@link SplitResult} consisting of a {@link SplitType} and a {@link TokenType} will be returned. 028 * Tokenization is achieved based on the {@link SplitResult} returned for each 029 * character. Please see notes below for each {@link SplitType} and {@link SplitResult}. 030 */ 031public abstract class SplitFunctionTokenizer implements Tokenizer { 032 033 /** 034 * Defines different ways that a tokenizer can split the input text at a given character. 035 */ 036 public enum SplitType { 037 /** 038 * the current character is added to the in-progress token (i.e. do not split on 039 * the current character) 040 */ 041 NO_SPLIT, 042 /** 043 * The current character will cause the in-progress token to be completed. the 044 * current character will not be included in any returned token and the token 045 * type of the corresponding SplitResult is ignored (See {@link SplitResult#SPLIT_AT}). 046 * This SplitType may be useful for whitespace. 047 */ 048 SPLIT_AT, 049 /** 050 * The current character will cause the in-progress token to be completed the 051 * current character will be included in the next token. The token type of the 052 * corresponding SplitResult is ignored (See {@link SplitResult#SPLIT_BEFORE}). This 053 * SplitType may be useful for e.g. capitalized letters when CamelCase splitting 054 * of digits when separating out a currency symbol. 055 */ 056 SPLIT_BEFORE, 057 /** 058 * The current character will cause the in-progress token to be completed after 059 * the current character is appended to the in-progress token. The token type of 060 * the created token (that includes the current character) will be assigned the 061 * type included with the {@link SplitResult}. 062 */ 063 SPLIT_AFTER, 064 /** 065 * The current character should cause the in-progress token to be completed. The 066 * token assigned to the in-progress token will be whatever was previously 067 * assigned to the previous character. This token will be followed by a second 068 * single-character token consisting of the current character. The token type 069 * assigned to this second token will be provided with the {@link SplitResult}. 070 */ 071 SPLIT_BEFORE_AND_AFTER 072 } 073 074 /** 075 * A combination of a {@link SplitType} and a {@link TokenType}. The TokenType of some 076 * SplitResult values are ignored and so not every combination of SplitType and 077 * TokenType is provided. For example, {@link SplitType#SPLIT_AT} and 078 * {@link SplitType#SPLIT_BEFORE} (as described above) create tokens whose types have 079 * already been determined. 080 */ 081 public enum SplitResult { 082 /** 083 * Not a split, is a word. 084 */ 085 NO_SPLIT_WORD(SplitType.NO_SPLIT, TokenType.WORD), 086 /** 087 * Not a split, is a ngram. 088 */ 089 NO_SPLIT_NGRAM(SplitType.NO_SPLIT, TokenType.NGRAM), 090 /** 091 * Not a split, is punctuation. 092 */ 093 NO_SPLIT_PUNCTUATION(SplitType.NO_SPLIT, TokenType.PUNCTUATION), 094 /** 095 * Not a split, is whitespace. 096 */ 097 NO_SPLIT_WHITESPACE(SplitType.NO_SPLIT, TokenType.WHITESPACE), 098 /** 099 * Not a split, is a prefix. 100 */ 101 NO_SPLIT_PREFIX(SplitType.NO_SPLIT, TokenType.PREFIX), 102 /** 103 * Not a split, is a suffix. 104 */ 105 NO_SPLIT_SUFFIX(SplitType.NO_SPLIT, TokenType.SUFFIX), 106 /** 107 * Not a split, is infix. 108 */ 109 NO_SPLIT_INFIX(SplitType.NO_SPLIT, TokenType.INFIX), 110 /** 111 * Not a split, is unknown. 112 */ 113 NO_SPLIT_UNKNOWN(SplitType.NO_SPLIT, TokenType.UNKNOWN), 114 /** 115 * Split at. 116 */ 117 SPLIT_AT(SplitType.SPLIT_AT, TokenType.WORD), //the token type is ignored 118 /** 119 * Split before. 120 */ 121 SPLIT_BEFORE(SplitType.SPLIT_BEFORE, TokenType.WORD), //the token type is ignored 122 /** 123 * Split after a word. 124 */ 125 SPLIT_AFTER_WORD(SplitType.SPLIT_AFTER, TokenType.WORD), 126 /** 127 * Split after a ngram. 128 */ 129 SPLIT_AFTER_NGRAM(SplitType.SPLIT_AFTER, TokenType.NGRAM), 130 /** 131 * Split after punctuation. 132 */ 133 SPLIT_AFTER_PUNCTUATION(SplitType.SPLIT_AFTER, TokenType.PUNCTUATION), 134 /** 135 * Split after whitespace. 136 */ 137 SPLIT_AFTER_WHITESPACE(SplitType.SPLIT_AFTER, TokenType.WHITESPACE), 138 /** 139 * Split after a prefix. 140 */ 141 SPLIT_AFTER_PREFIX(SplitType.SPLIT_AFTER, TokenType.PREFIX), 142 /** 143 * Split after a suffix. 144 */ 145 SPLIT_AFTER_SUFFIX(SplitType.SPLIT_AFTER, TokenType.SUFFIX), 146 /** 147 * Split after infix. 148 */ 149 SPLIT_AFTER_INFIX(SplitType.SPLIT_AFTER, TokenType.INFIX), 150 /** 151 * Split after an unknown value. 152 */ 153 SPLIT_AFTER_UNKNOWN(SplitType.SPLIT_AFTER, TokenType.UNKNOWN), 154 /** 155 * Split before and after a word. 156 */ 157 SPLIT_BEFORE_AND_AFTER_WORD(SplitType.SPLIT_BEFORE_AND_AFTER, TokenType.WORD), 158 /** 159 * Split before and after a ngram. 160 */ 161 SPLIT_BEFORE_AND_AFTER_NGRAM(SplitType.SPLIT_BEFORE_AND_AFTER, TokenType.NGRAM), 162 /** 163 * Split before and after punctuation. 164 */ 165 SPLIT_BEFORE_AND_AFTER_PUNCTUATION(SplitType.SPLIT_BEFORE_AND_AFTER, TokenType.PUNCTUATION), 166 /** 167 * Split before and after whitespace. 168 */ 169 SPLIT_BEFORE_AND_AFTER_WHITESPACE(SplitType.SPLIT_BEFORE_AND_AFTER, TokenType.WHITESPACE), 170 /** 171 * Split before and after prefix. 172 */ 173 SPLIT_BEFORE_AND_AFTER_PREFIX(SplitType.SPLIT_BEFORE_AND_AFTER, TokenType.PREFIX), 174 /** 175 * Split before and after suffix. 176 */ 177 SPLIT_BEFORE_AND_AFTER_SUFFIX(SplitType.SPLIT_BEFORE_AND_AFTER, TokenType.SUFFIX), 178 /** 179 * Split before and after infix. 180 */ 181 SPLIT_BEFORE_AND_AFTER_INFIX(SplitType.SPLIT_BEFORE_AND_AFTER, TokenType.INFIX), 182 /** 183 * Split before and after unknown. 184 */ 185 SPLIT_BEFORE_AND_AFTER_UNKNOWN(SplitType.SPLIT_BEFORE_AND_AFTER, TokenType.UNKNOWN); 186 187 /** 188 * The split type. 189 */ 190 public final SplitType splitType; 191 /** 192 * The token type. 193 */ 194 public final TokenType tokenType; 195 196 SplitResult(SplitType splitType, TokenType tokenType) { 197 this.splitType = splitType; 198 this.tokenType = tokenType; 199 } 200 } 201 202 /** 203 * An interface for checking if the text should be split at the supplied codepoint. 204 */ 205 @FunctionalInterface 206 public static interface SplitFunction { 207 /** 208 * Applies the split function. 209 * @param codepoint The codepoint to check. 210 * @param index The character index. 211 * @param cs The sequence that's being split. 212 * @return How the sequence should be split. 213 */ 214 public SplitResult apply(int codepoint, int index, CharSequence cs); 215 } 216 217 /** 218 * The splitting function. 219 */ 220 protected SplitFunction splitFunction; 221 222 /** 223 * Constructs a tokenizer, used by OLCUT. 224 */ 225 protected SplitFunctionTokenizer() { } 226 227 /** 228 * Creates a new tokenizer using the supplied split function. 229 * @param splitFunction The split function. 230 */ 231 public SplitFunctionTokenizer(SplitFunction splitFunction) { 232 super(); 233 this.splitFunction = splitFunction; 234 } 235 236 private String cs; 237 238 private int start; 239 240 private int p; 241 242 private StringBuilder tokenSb = new StringBuilder(); 243 244 private TokenType currentType = TokenType.WORD; 245 246 private Token currentToken; 247 248 private Token nextToken; 249 250 private boolean ready; 251 252 @Override 253 public void reset(CharSequence cs) { 254 this.cs = cs.toString(); 255 start = 0; 256 p = 0; 257 tokenSb.delete(0, tokenSb.length()); 258 ready = false; 259 } 260 261 @Override 262 public boolean advance() { 263 if (cs == null) { 264 throw new IllegalStateException("SplitFunctionTokenizer has not been reset."); 265 } 266 if (nextToken != null) { 267 currentToken = nextToken; 268 nextToken = null; 269 return true; 270 } 271 if (p >= cs.length()) { 272 return false; 273 } 274 275 currentToken = null; 276 277 SplitResult splitResult; 278 SplitType splitType; 279 TokenType tokenType; 280 281 tokenSb.delete(0, tokenSb.length()); 282 while (p < cs.length()) { 283 int codepoint = cs.codePointAt(p); 284 splitResult = splitFunction.apply(codepoint, p, cs); 285 splitType = splitResult.splitType; 286 tokenType = splitResult.tokenType; 287 // If we want to keep it, then go ahead and do that and remember 288 // where the end of the token is. 289 if (splitType == SplitType.NO_SPLIT) { 290 if (tokenSb.length() == 0) { 291 start = p; 292 } 293 p += Character.charCount(codepoint); 294 tokenSb.appendCodePoint(codepoint); 295 currentType = tokenType; 296 continue; 297 } 298 299 if (splitType == SplitType.SPLIT_AT) { 300 if (tokenSb.length() > 0) { 301 currentToken = new Token(tokenSb.toString(), start, p, currentType); 302 } 303 p += Character.charCount(codepoint); 304 start = p; 305 tokenSb.delete(0, tokenSb.length()); 306 } else if (splitType == SplitType.SPLIT_BEFORE) { 307 if (tokenSb.length() > 0) { 308 currentToken = new Token(tokenSb.toString(), start, p, currentType); 309 } 310 start = p; 311 tokenSb.delete(0, tokenSb.length()); 312 tokenSb.appendCodePoint(codepoint); 313 p += Character.charCount(codepoint); 314 } else if (splitType == SplitType.SPLIT_AFTER) { 315 p += Character.charCount(codepoint); 316 tokenSb.appendCodePoint(codepoint); 317 // no need to check the length since we just added a code point 318 currentToken = new Token(tokenSb.toString(), start, p, tokenType); 319 tokenSb.delete(0, tokenSb.length()); 320 start = p; 321 } else if (splitType == SplitType.SPLIT_BEFORE_AND_AFTER) { 322 // wrap up the token we are currently building and then create 323 // the next token which consists of just the character 324 if (tokenSb.length() > 0) { 325 currentToken = new Token(tokenSb.toString(), start, p, currentType); 326 tokenSb.delete(0, tokenSb.length()); 327 start = p; 328 p += Character.charCount(codepoint); 329 tokenSb.appendCodePoint(codepoint); 330 nextToken = new Token(tokenSb.toString(), start, p, tokenType); 331 tokenSb.delete(0, tokenSb.length()); 332 } else { 333 start = p; 334 p += Character.charCount(codepoint); 335 tokenSb.appendCodePoint(codepoint); 336 currentToken = new Token(tokenSb.toString(), start, p, tokenType); 337 tokenSb.delete(0, tokenSb.length()); 338 } 339 } 340 if (currentToken != null) { 341 break; 342 } 343 } 344 345 if (currentToken == null) { 346 if (tokenSb.length() > 0) { 347 currentToken = new Token(tokenSb.toString(), start, p, currentType); 348 } 349 } 350 351 // We advanced if we have some stuff collected. 352 if (currentToken != null) { 353 ready = true; 354 return true; 355 } else { 356 return false; 357 } 358 } 359 360 @Override 361 public String getText() { 362 if (ready) { 363 return currentToken.text; 364 } else { 365 throw new IllegalStateException("SplitFunctionTokenizer is not ready."); 366 } 367 } 368 369 @Override 370 public int getStart() { 371 if (ready) { 372 return currentToken.start; 373 } else { 374 throw new IllegalStateException("SplitFunctionTokenizer is not ready."); 375 } 376 } 377 378 @Override 379 public int getEnd() { 380 if (ready) { 381 return currentToken.end; 382 } else { 383 throw new IllegalStateException("SplitFunctionTokenizer is not ready."); 384 } 385 } 386 387 @Override 388 public TokenType getType() { 389 return currentToken.type; 390 } 391 392 @Override 393 public Tokenizer clone() throws CloneNotSupportedException { 394 throw new UnsupportedOperationException( 395 "abstract class SplitFunctionTokenizer does not implement clone method. Subclasses must implement this method."); 396 } 397 398}