/*
 * Copyright (c) 2015-2021, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.util.tokens;

import java.util.Objects;

/**
 * A single token extracted from a String.
 * <p>
 * Tokens are immutable, and may be records one day. Two tokens are
 * {@link #equals(Object) equal} if all four fields (text, start, end, type)
 * are equal.
 */
public class Token {

    /**
     * The token text.
     */
    public final String text;
    /**
     * The start index.
     */
    public final int start;
    /**
     * The end index.
     */
    public final int end;
    /**
     * The token type.
     */
    public final TokenType type;

    /**
     * Constructs a token of type {@link TokenType#WORD}.
     *
     * @param text should be equivalent to the substring of the original tokenized
     *             text for the given character offsets start and end
     * @param start the starting character offset of the token
     * @param end the ending character offset of the token (exclusive, as
     *            reflected by {@link #length()})
     */
    public Token(String text, int start, int end) {
        this(text, start, end, TokenType.WORD);
    }

    /**
     * Constructs a token.
     *
     * @param text should be equivalent to the substring of the original tokenized
     *             text for the given character offsets start and end
     * @param start the starting character offset of the token
     * @param end the ending character offset of the token (exclusive, as
     *            reflected by {@link #length()})
     * @param type the type of the token
     */
    public Token(String text, int start, int end, TokenType type) {
        this.text = text;
        this.start = start;
        this.end = end;
        this.type = type;
    }

    /**
     * The number of characters in this token.
     *
     * @return The number of characters.
     */
    public int length() {
        return this.end - this.start;
    }

    @Override
    public String toString() {
        return this.text + "[type=" + this.type + "," + this.start + "," + this.end + "]";
    }

    /**
     * Compares this token to another object field by field.
     * <p>
     * Tokens are immutable value objects, so equality is defined over all four
     * fields: text, start, end and type.
     *
     * @param other The object to compare against.
     * @return True if {@code other} is a Token with equal fields.
     */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (!(other instanceof Token)) {
            return false;
        }
        Token o = (Token) other;
        return start == o.start && end == o.end
                && Objects.equals(text, o.text) && type == o.type;
    }

    @Override
    public int hashCode() {
        return Objects.hash(text, start, end, type);
    }

    /**
     * Tokenizers may produce multiple kinds of tokens, depending on the application
     * to which they're being put. For example, when processing a document for
     * highlighting during querying, we need to send through whitespace and
     * punctuation so that the document looks as it did in its original form. For
     * most tokenizer applications, they will only send word tokens.
     */
    public enum TokenType {
        /**
         * A WORD corresponds to a token that does not consist of or contain whitespace
         * and may correspond to a regular "word" that could be looked up in a
         * dictionary. Some tokenizers do not distinguish between different kinds of
         * tokens and may use this as a default type for all generated tokens.
         */
        WORD,
        /**
         * An NGRAM corresponds to a token that might correspond to a character ngram -
         * i.e., some portion/sub-span of a regular word token.
         */
        NGRAM,
        /**
         * A PUNCTUATION corresponds to tokens consisting of punctuation characters. In
         * some applications, a PUNCTUATION may be treated differently because they may
         * have less semantic content than regular word tokens.
         */
        PUNCTUATION,
        /**
         * Some tokenizers may produce tokens corresponding to whitespace (e.g. space,
         * tab, newline, etc.) It may be important for consumers of tokens generated by
         * a tokenizer to ignore/skip WHITESPACE tokens to avoid unexpected behavior.
         */
        WHITESPACE,
        /**
         * Some tokenizers produce "sub-word" tokens. A PREFIX corresponds to a sub-word
         * word-prefix token.
         */
        PREFIX,
        /**
         * Some tokenizers produce "sub-word" tokens. A SUFFIX corresponds to a sub-word
         * word-suffix token.
         */
        SUFFIX,
        /**
         * Some tokenizers produce "sub-word" tokens. An INFIX corresponds to a sub-word
         * "infix" token (i.e. from the middle).
         */
        INFIX,
        /**
         * Some tokenizers may work in concert with vocabulary data. Some applications
         * may treat out-of-vocabulary tokens differently than other tokens. An UNKNOWN
         * token corresponds to a token that is out-of-vocabulary or has never been seen
         * before.
         */
        UNKNOWN
    }

}