/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.util.tokens.universal;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
import org.tribuo.util.tokens.Token;
import org.tribuo.util.tokens.Tokenizer;

import java.util.Arrays;
import java.util.LinkedList;
import java.util.Queue;

/**
 * This class was originally written for the purpose of document indexing in an
 * information retrieval context (principally used in Sun Labs' Minion search
 * engine). It was refactored here to implement the Tokenizer interface taking
 * care that the 'ngram' tokens had correct character offsets. This is
 * typically not required in the document indexing context - but is essential
 * in other kinds of text processing / NLP tasks.
 * <p>
 * This tokenizer has some specific behavior in how it handles "ngram"
 * characters - i.e., those characters for which {@link #isNgram(char)} returns
 * true (CJK characters and others). For these characters, it will generate
 * tokens corresponding to character bigrams in addition to tokens corresponding
 * to token unigrams. Most of the other tokenizers will generate tokens that
 * have no overlapping spans but here the character bigram tokens will overlap
 * with the character unigram tokens.
 * <p>
 * This tokenizer uses bigram tokenization whenever it encounters 'ngram'
 * characters in the CJK range (among others see {@link #isNgram(char)}). It
 * otherwise tokenizes using punctuation and whitespace separators to separate
 * words. Within runs of 'ngram' characters the tokenizer will generate tokens
 * corresponding to two adjacent characters in addition to tokens corresponding
 * to each character. The tokens corresponding to character bigrams may overlap
 * with the previous and next token. An end-of-line between two 'ngram'
 * characters is ignored (i.e., a character bigram token will be created.)
 * <p>
 * For example, a sequence of three Chinese characters, 非常感, would tokenize as
 * three WORD type tokens: 非, 常, and 感 and two NGRAM type tokens: 非常 and 常感.
 * Here these tokens will have character offsets that correspond to the
 * character offsets into the text. Here are the tokens listed with their
 * character offsets:
 * <ul>
 * <li>非[0,1]</li>
 * <li>非常[0,2]</li>
 * <li>常[1,2]</li>
 * <li>常感[1,3]</li>
 * <li>感[2,3]</li>
 * </ul>
 */
public class UniversalTokenizer implements Tokenizer {

    /**
     * The length of the longest token that we will generate.
     */
    protected int maxTokenLength = 256;
    /**
     * Set once the whole character sequence has been consumed.
     */
    private boolean eofReached = false;

    /**
     * The character position in the character sequence that we're tokenizing.
     */
    private int pos;

    /**
     * The starting offset of the current buffer in the token stream.
     */
    private int start;

    /**
     * If <code>true</code> then unigrams will be generated for each n-gram
     * character.
     */
    private boolean generateUnigrams = true;

    /**
     * If <code>true</code> then character bigrams will be generated for each n-gram
     * character as defined by {@link #isNgram(char)}.
     */
    private boolean generateNgrams = true;
    /**
     * The state of the tokenizer determined by previous history.
     */
    private State state;
    /**
     * The character sequence that we're currently processing.
     */
    private CharSequence cs;
    /**
     * The token that we're building.
     */
    private char[] buffer;
    /**
     * A string representation of the current token.
     */
    private String currToken;
    /**
     * The current type of the token.
     */
    private Token.TokenType currType;
    /**
     * The current word position of the token.
     */
    private int currPos;
    /**
     * The starting offset of the current token.
     */
    private int startOffset;
    /**
     * The ending offset of the current token.
     */
    private int endOffset;
    /**
     * The length of the current token we're building.
     */
    private int tokenLength;
    /**
     * Whether this is the first token.
     */
    private boolean firstToken;
    /**
     * Is the tokenizer ready?
     */
    private boolean ready;
    @Config(description="Send punctuation through as tokens.")
    private boolean sendPunct = false;
    /**
     * A set of tokens that were generated and need to be returned.
     */
    private Queue<Range> queuedTokens;
    /**
     * Recycled Range objects, reused to avoid allocation per token.
     */
    private Queue<Range> pool;
    /**
     * The current character that we're processing.
     */
    private char c;

    /**
     * Constructs a universal tokenizer.
     * @param sendPunct if sendPunct is true, then the tokenizer will generate punctuation tokens.
     */
    public UniversalTokenizer(boolean sendPunct) {
        this.sendPunct = sendPunct;
        this.buffer = new char[maxTokenLength];
        this.tokenLength = 0;
        this.state = State.SKIPPING;
        this.queuedTokens = new LinkedList<>();
        this.pool = new LinkedList<>();
    }

    /**
     * Constructs a universal tokenizer which doesn't send punctuation.
     */
    public UniversalTokenizer() {
        this(false);
    }

    /**
     * A quick check for whether a character should be kept in a word or should
     * be removed from the word if it occurs at one of the ends. An
     * approximation of Character.isLetterOrDigit, but is faster and more
     * correct, since it doesn't count the smart quotes as letters.
     *
     * @param c The character to check.
     * @return True if the input is a letter or digit.
     */
    public static boolean isLetterOrDigit(char c) {
        if ((c <= 122 && c >= 97) // most frequent: lowercase a...z
                || (c <= 90 && c >= 65) // frequent: uppercase A...Z
                || (c <= 57 && c >= 48) // frequent: numbers 0...9
                ) {
            return true;
        } else if ((c <= 96) // includes whitespace and ASCII punctuation
                || (c == 210 || c == 211) // (smart quotes)
                || (c >= 123 && c <= 127) // {|}~DEL
                ) {
            return false;
        } else if (c >= '\u3021' && c <= '\u3029') {
            // Suzhou/Hangzhou-style numerals. NOTE(review): a previous revision
            // used the decimal literals 3021..3029, which actually denote the
            // Tamil combining marks U+0BCD..U+0BD5 rather than the numerals at
            // U+3021..U+3029, wrongly classifying those marks as letters. The
            // duplicated (and unreachable) A-Z / 0-9 checks that accompanied it
            // have been removed - both ranges already return true above.
            return true;
        } else {
            return Character.isLetterOrDigit(c);
        }
    }

    /**
     * A quick check for whether a character is a digit.
     *
     * @param c The character to check
     * @return True if the input is a digit.
     */
    public static boolean isDigit(char c) {
        if (c <= 57 && c >= 48) { // most frequent: ASCII numbers 0...9
            return true;
        } else if (c <= 255) {
            return false;
        } else {
            return Character.isDigit(c);
        }
    }

    /**
     * A quick check for whether a character is whitespace.
     *
     * @param c The character to check
     * @return True if the input is a whitespace character.
     */
    public static boolean isWhitespace(char c) {
        //test for white space
        if ((c == 32) // Space
                || (c <= 13 && c >= 9) // Tab, Linefeed, Vertical Tab, Formfeed, Carriage Return
                || (c <= 4 && c >= 1) // SOH, STX, ETX, EOT
                ) {
            return true;
        } else if (c <= 255) {
            return false;
        } else {
            return Character.isWhitespace(c);
        }
    }

    /**
     * A quick check for a character in a language that may not separate words
     * with whitespace (includes Arabic, CJK, and Thai). Uses Unicode Standard
     * Version 2.0.
     *
     * @param c The character to check
     * @return True if the input character is in a region which is not whitespace separated.
     */
    public static boolean isNgram(char c) {
        // Test for characters that may not separate words with white
        // space and therefore require bigram treatment.
        // Uses Unicode Standard Version 2.0.
        if (c > '\u3002' && c <= '\uD7FF') { // (CJK Characters)
            return (c < '\u3040' || c > '\u30FF'); // - Hiragana and Katakana
        } else if ((c >= '\u0600' && c <= '\u06FF') || // (Arabic)
                (c >= '\uF900' && c <= '\uFAFF') || // (CJK Compatibility Ideographs)
                (c >= '\u1100' && c <= '\u11FF') || // (Hangul Jamo)
                (c >= '\uFB50' && c <= '\uFE2F') || // (Arabic Presentation Forms-A)
                (c >= '\uFE30' && c <= '\uFE4F') || // (CJK Compatibility Forms)
                (c >= '\uFE70' && c <= '\uFEFF') || // (Arabic Presentation Forms-B)
                (c >= '\uFF60' && c <= '\uFFDF') || // (CJK Half Width Forms)
                (c >= '\u0E00' && c <= '\u0E7F') || // (Thai)
                (c >= '\u0E80' && c <= '\u0EFF') || // (Lao)
                (c >= '\u0F00' && c <= '\u0FBF') || // (Tibetan)
                (c >= '\u0B80' && c <= '\u0BFF') || // (Tamil)
                (c >= '\u0C00' && c <= '\u0C7F') || // (Telugu)
                (c >= '\u0C80' && c <= '\u0CFF') || // (Kannada)
                (c >= '\u0D00' && c <= '\u0D7F') || // (Malayalam)
                (c >= '\u10A0' && c <= '\u10FF')) { // (Georgian)
            return true;
        } else {
            return false;
        }
    }

    /**
     * Does this tokenizer generate unigrams?
     * @return True if the tokenizer generates unigram tokens.
     */
    public boolean isGenerateUnigrams() {
        return generateUnigrams;
    }

    /**
     * Controls if the tokenizer generates unigrams.
     * @param generateUnigrams If true generates unigram tokens.
     */
    public void setGenerateUnigrams(boolean generateUnigrams) {
        this.generateUnigrams = generateUnigrams;
    }

    /**
     * Does this tokenizer generate ngrams?
     * @return True if the tokenizer generates ngram tokens.
     */
    public boolean isGenerateNgrams() {
        return generateNgrams;
    }

    /**
     * Controls if the tokenizer generates ngrams.
     * @param generateNgrams If true generates ngram tokens.
     */
    public void setGenerateNgrams(boolean generateNgrams) {
        this.generateNgrams = generateNgrams;
    }

    /**
     * Returns the maximum token length this tokenizer will generate.
     * @return The maximum token length.
     */
    public int getMaxTokenLength() {
        return maxTokenLength;
    }

    /**
     * Sets the maximum token length this tokenizer will generate.
     * @param maxTokenLength The maximum token length.
     */
    public void setMaxTokenLength(int maxTokenLength) {
        this.maxTokenLength = maxTokenLength;
    }

    @Override
    public ConfiguredObjectProvenance getProvenance() {
        return new ConfiguredObjectProvenanceImpl(this, "Tokenizer");
    }

    @Override
    public final boolean advance() {
        if (cs == null) {
            throw new IllegalStateException("UniversalTokenizer has not been reset.");
        }
        //
        // Do we have tokens queued up to go?
        if (!queuedTokens.isEmpty()) {
            handleQueued();
            return true;
        }

        //
        // If we've already read the data, then we're done.
        if (eofReached) {
            return false;
        }

        //
        // Read characters until we have one or more tokens to send.
        while (pos < cs.length()) {
            c = cs.charAt(pos);
            handleChar();
            pos++;
            if (!queuedTokens.isEmpty()) {
                handleQueued();
                return true;
            }
        }

        //
        // End of input: flush any partially collected token.
        eofReached = true;
        makeTokens();
        if (!queuedTokens.isEmpty()) {
            handleQueued();
            return true;
        }
        return false;
    }

    /**
     * Pops the next queued token and exposes it via the getter methods,
     * recycling its Range into the pool.
     */
    private void handleQueued() {
        ready = true;
        Range range = queuedTokens.poll();
        currToken = new String(range.buff, 0, range.len);
        startOffset = range.start;
        endOffset = range.end;
        if (firstToken && range.incr == 0) {
            // The very first token always advances the word position.
            range.incr = 1;
            firstToken = false;
        }
        currType = range.type;
        currPos = range.incr;
        pool.offer(range);
    }

    /**
     * Handle a character to add to the token buffer.
     */
    protected void handleChar() {

        //
        // ASCII characters.
        if ((c >= 97 && c <= 122) || (c >= 65 && c <= 90)) {
            if (state == State.NGRAM) {
                makeTokens();
            }
            addChar();
            state = State.COLLECTING;
            return;
        }

        //
        // ASCII space. We need to treat other whitespace differently, depending
        // on whether we're ngram tokenizing.
        if (c == 32) {
            switch (state) {
                case COLLECTING:
                case NGRAM:
                    // The transition from collecting or n-gram to whitespace
                    // causes us to emit tokens.
                    makeTokens();
                    break;
                case SKIPPING:
                    break;
                default:
                    break;
            }
            sendPunct();
            state = State.SKIPPING;
            return;
        }

        if (isNgram(c)) {
            // CJK characters (Chinese, Japanese, Korean)
            // to be tokenized with bigram tokens.
            // (Put this test here so these languages will tokenize
            // more efficiently and it doesn't cost much for the non CJK
            // languages.)

            switch (state) {
                case SKIPPING:
                    state = State.NGRAM;
                    break;
                case COLLECTING:
                    makeTokens();
                    state = State.NGRAM;
                    break;
                case NGRAM:
                    break;
                default:
                    break;
            }
            addChar();
            return;
        }

        if (c == 0 || (state == State.NGRAM && (c >= 10 && c <= 13))) {
            // While processing ngram character regions, Linefeed, Vertical Tab,
            // Formfeed and Carriage Return don't do anything, so just return.
            // This lets a character bigram span an end-of-line.
            return;
        }

        if (isWhitespace(c)) {
            // The rest of the white space characters for break:
            switch (state) {
                case COLLECTING:
                case NGRAM:
                    // The transition from collecting to whitespace
                    // causes us to emit tokens.
                    makeTokens();
                    break;
                case SKIPPING:
                    break;
                default:
                    break;
            }
            sendPunct();
            state = State.SKIPPING;
            return;
        }

        if ((c >= 48 && c <= 57) || (c > 255 && Character.isDigit(c))) {

            //
            // The digits.
            switch (state) {
                case SKIPPING:
                    state = State.COLLECTING;
                    break;
                case NGRAM:
                    makeTokens();
                    state = State.COLLECTING;
                    break;
                case COLLECTING:
                    break;
                default:
                    break;
            }
            addChar();
            return;
        }

        //
        // Any other letter or digit.
        if (isLetterOrDigit(c)) {
            if (state == State.NGRAM) {
                makeTokens();
            }
            addChar();
            state = State.COLLECTING;
            return;
        }

        // Anything other than the above cases, we break.
        if (state != State.SKIPPING) {
            makeTokens();
        }
        sendPunct();
        state = State.SKIPPING;
    }

    /**
     * Queues the current character as a punctuation token, if punctuation
     * tokens are enabled and the character is not whitespace.
     */
    private void sendPunct() {
        if (sendPunct && !isWhitespace(c)) {
            Range r = getRange();
            r.punct(c, pos);
            queuedTokens.add(r);
        }
    }

    /**
     * Add a character to the buffer that we're building for a token.
     */
    protected void addChar() {

        //
        // First see if token buffer needs to be expanded.
        // Note: tokenLength points to the next unused slot in the buffer.
        if (buffer.length <= tokenLength) {
            buffer = Arrays.copyOf(buffer, tokenLength + 32);
        }

        if (tokenLength == 0) {
            // Remember where this token began in the input.
            start = pos;
        }
        buffer[tokenLength++] = c;

        if (tokenLength >= maxTokenLength) {
            // Force out an over-long token rather than growing without bound.
            makeTokens();
        }
    }

    @Override
    public int getStart() {
        if (ready) {
            return startOffset;
        } else {
            throw new IllegalStateException("UniversalTokenizer is not ready.");
        }
    }

    @Override
    public int getEnd() {
        if (ready) {
            return endOffset;
        } else {
            throw new IllegalStateException("UniversalTokenizer is not ready.");
        }
    }

    @Override
    public String getText() {
        if (ready) {
            return currToken;
        } else {
            throw new IllegalStateException("UniversalTokenizer is not ready.");
        }
    }

    @Override
    public Token.TokenType getType() {
        if (ready) {
            return currType;
        } else {
            throw new IllegalStateException("UniversalTokenizer is not ready.");
        }
    }

    /**
     * Gets the current position in the input.
     * @return The current position.
     */
    public int getPos() {
        return currPos;
    }

    @Override
    public Tokenizer clone() {
        try {
            UniversalTokenizer copy = (UniversalTokenizer) super.clone();

            // Give the copy fresh mutable state so it is independent
            // of this tokenizer.
            copy.buffer = new char[maxTokenLength];
            copy.tokenLength = 0;
            copy.state = State.SKIPPING;
            copy.pool = new LinkedList<>();
            copy.queuedTokens = new LinkedList<>();
            copy.currToken = null;
            copy.ready = false;
            copy.cs = null;

            return copy;
        } catch (CloneNotSupportedException e) {
            throw new AssertionError("UniversalTokenizer is Cloneable, but clone call failed");
        }
    }

    /**
     * Reset state of tokenizer to clean slate.
     */
    @Override
    public void reset(CharSequence cs) {
        this.cs = cs;
        pos = 0;
        tokenLength = 0;
        start = -1;
        state = State.SKIPPING;
        eofReached = false;
        firstToken = true;
        c = 0;
        startOffset = -1;
        endOffset = -1;
        currToken = null;
        ready = false;
        // Discard tokens left over from a previous, partially consumed run so
        // they can't leak into this one, recycling their ranges for reuse.
        pool.addAll(queuedTokens);
        queuedTokens.clear();
    }

    /**
     * Gets a Range from the pool, or allocates a new one if the pool is empty.
     */
    private Range getRange() {
        if (pool.isEmpty()) {
            return new Range();
        }
        return pool.remove();
    }

    /**
     * Make one or more tokens from our current collected characters.
     */
    protected void makeTokens() {

        //
        // Don't generate empty tokens.
        if (tokenLength <= 0) {
            return;
        }

        if (state == State.NGRAM) {
            // if we only have one character, then just generate a single
            // token and be done.
            if (tokenLength == 1) {
                Range range = getRange();
                range.set(buffer[0], start);
                queuedTokens.add(range);
                tokenLength = 0;
                return;
            }

            for (int i = 0; i < tokenLength; i++) {
                if (generateUnigrams) {
                    // Generate a unigram for this character.
                    Range range = getRange();
                    range.set(buffer[i], start + i);
                    queuedTokens.add(range);
                }
                if (generateNgrams && i < tokenLength - 1) {
                    // Generate a bigram for this character.
                    Range range = getRange();
                    range.set(buffer[i], buffer[i + 1], start + i);
                    queuedTokens.add(range);
                }
            }
        } else {
            // Generate one token from the buffer.
            Range range = getRange();
            range.set(buffer, tokenLength, start);
            queuedTokens.add(range);
        }
        tokenLength = 0;
    }

    /**
     * The tokenizer's state machine states.
     */
    private enum State {
        SKIPPING,
        COLLECTING,
        NGRAM,
    }

}