001/*
002 * Copyright (c) 2015-2021, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.util.tokens;
018
019/**
020 * A single token extracted from a String.
021 * <p>
022 * Tokens are immutable, and may be records one day.
023 */
024public class Token {
025
026    /**
027     * The token text.
028     */
029    public final String text;
030    /**
031     * The start index.
032     */
033    public final int start;
034    /**
035     * The end index.
036     */
037    public final int end;
038    /**
039     * The token type.
040     */
041    public final TokenType type;
042
043    /**
044     * Constructs a token.
045     * 
046     * @param text  should be equivalent to the substring of the original tokenized
047     *              text for the given character offsets start and end
048     * @param start the starting offset of the token
049     * @param end   the ending offset of the token (exclusive or inclusive?)
050     */
051    public Token(String text, int start, int end) {
052        this(text, start, end, TokenType.WORD);
053    }
054
055    /**
056     * Constructs a token.
057     * 
058     * @param text  should be equivalent to the substring of the original tokenized
059     *              text for the given character offsets start and end
060     * @param start the starting offset of the token
061     * @param end   the ending offset of the token (exclusive or inclusive?)
062     * @param type  the type of the token
063     */
064    public Token(String text, int start, int end, TokenType type) {
065        this.text = text;
066        this.start = start;
067        this.end = end;
068        this.type = type;
069    }
070
071    /**
072     * The number of characters in this token.
073     * 
074     * @return The number of characters.
075     */
076    public int length() {
077        return this.end - this.start;
078    }
079
080    @Override
081    public String toString() {
082        return this.text + "[type=" + this.type + "," + this.start + "," + this.end + "]";
083    }
084
085    /**
086     * Tokenizers may product multiple kinds of tokens, depending on the application
087     * to which they're being put. For example, when processing a document for
088     * highlighting during querying, we need to send through whitespace and
089     * punctuation so that the document looks as it did in it's original form. For
090     * most tokenizer applications, they will only send word tokens.
091     */
092    public enum TokenType {
093        /**
094         * A WORD corresponds to a token that does not consist of or contain whitespace
095         * and may correspond to a regular "word" that could be looked up in a
096         * dictionary. Some tokenizers do not distinguish between different kinds of
097         * tokens and may use this as a default type for all generated tokens.
098         */
099        WORD,
100        /**
101         * An NGRAM corresponds to a token that might correspond to a character ngram -
102         * i.e. some portion / sub-span of a regular word token (for example.)
103         */
104        NGRAM,
105        /**
106         * A PUNCTUATION corresponds to tokens consisting of punctuation characters. In
107         * some applications, a PUNCTUATION may be treated differently because they may
108         * have less semantic content than regular word tokens.
109         */
110        PUNCTUATION,
111        /**
112         * Some tokenizers may produce tokens corresponding to whitespace (e.g. space,
113         * tab, newline, etc.) It may be important for consumers of tokens generated by
114         * a tokenizer to ignore/skip WHITESPACE tokens to avoid unexpected behavior.
115         */
116        WHITESPACE,
117        /**
118         * Some tokenizers produce "sub-word" tokens. A PREFIX corresponds to a sub-word
119         * word-prefix token.
120         */
121        PREFIX,
122        /**
123         * Some tokenizers produce "sub-word" tokens. A SUFFIX corresponds to a sub-word
124         * word-suffix token.
125         */
126        SUFFIX,
127        /**
128         * Some tokenizers produce "sub-word" tokens. An INFIX corresponds to a sub-word
129         * "infix" token (i.e. from the middle).
130         */
131        INFIX,
132        /**
133         * Some tokenizers may work in concert with vocabulary data. Some applications
134         * may treat out-of-vocabulary tokens differently than other tokens. An UNKNOWN
135         * token corresponds to a token that is out-of-vocabulary or has never been seen
136         * before.
137         */
138        UNKNOWN
139    }
140
141}