001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.util.tokens;
018
019import com.oracle.labs.mlrg.olcut.config.Configurable;
020import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
021import com.oracle.labs.mlrg.olcut.provenance.Provenancable;
022
023import java.util.ArrayList;
024import java.util.Collections;
025import java.util.List;
026import java.util.function.Supplier;
027
028/**
029 * An interface for things that tokenize text: breaking it into words according
030 * to some set of rules.
031 * <p>
032 * Note that tokenizers are not guaranteed to be thread safe! Using the same
033 * tokenizer from multiple threads may result in strange behavior.
034 * <p>
035 * Tokenizers which are not ready throw {@link IllegalStateException}
036 * when {@link Tokenizer#advance} or any get method is called.
037 * <p>
038 * Most Tokenizers are Cloneable, and implement the Cloneable interface.
039 */
040public interface Tokenizer extends Configurable, Cloneable, Provenancable<ConfiguredObjectProvenance> {
041
042    /**
043     * Creates a supplier from the specified tokenizer by cloning it.
044     * @param tokenizer The tokenizer to copy.
045     * @return A supplier of tokenizers.
046     */
047    static Supplier<Tokenizer> createSupplier(Tokenizer tokenizer) {
048        Supplier<Tokenizer> supplier = () -> {
049            try {
050                return tokenizer.clone();
051            } catch (CloneNotSupportedException e) {
052                throw new RuntimeException(e);
053            }
054        };
055        return supplier;
056    }
057
058    /**
059     * Creates a thread local source of tokenizers by making a Tokenizer supplier using {@link #createSupplier(Tokenizer)}.
060     * @param tokenizer The tokenizer to copy.
061     * @return A thread local for tokenizers.
062     */
063    static ThreadLocal<Tokenizer> createThreadLocal(Tokenizer tokenizer) {
064        return ThreadLocal.withInitial(createSupplier(tokenizer));
065    }
066
067    /**
068     * Resets the tokenizer so that it operates on a new sequence of characters.
069     *
070     * @param cs a character sequence to tokenize
071     */
072    public void reset(CharSequence cs);
073
074    /**
075     * Advances the tokenizer to the next token.
076     *
077     * @return {@code true} if there is such a token, {@code false}
078     * otherwise.
079     */
080    public boolean advance();
081
082    /**
083     * Gets the text of the current token, as a string
084     *
085     * @return the text of the current token
086     */
087    public String getText();
088
089    /**
090     * Gets the starting character offset of the current token in the character
091     * sequence
092     *
093     * @return the starting character offset of the token
094     */
095    public int getStart();
096
097    /**
098     * Gets the ending offset (exclusive) of the current token in the character
099     * sequence
100     *
101     * @return the exclusive ending character offset for the current token.
102     */
103    public int getEnd();
104
105    /**
106     * Gets the type of the current token.
107     *
108     * @return the type of the current token.
109     */
110    public Token.TokenType getType();
111
112    /**
113     * Clones a tokenizer with it's configuration. Cloned tokenizers are
114     * not processing the same text as the original tokenizer and need to be reset
115     * with a fresh CharSequence.
116     *
117     * @return A tokenizer with the same configuration, but independent state.
118     * @throws CloneNotSupportedException if the tokenizer isn't cloneable.
119     */
120    public Tokenizer clone() throws CloneNotSupportedException;
121
122    /**
123     * Generates a Token object from the current state of the tokenizer.
124     * @return The token object from the current state.
125     */
126    default public Token getToken() {
127        return new Token(getText(), getStart(), getEnd(), getType());
128    }
129
130    /**
131     * Uses this tokenizer to tokenize a string and return the list of tokens
132     * that were generated. Many applications will simply want to take a
133     * character sequence and get a list of tokens, so this will do that for
134     * them.
135     *
136     * <p>
137     * Here is the contract of the tokenize function:
138     * <ul>
139     * <li>all returned tokens correspond to substrings of the input text</li>
140     * <li>the tokens do not overlap</li>
141     * <li>the tokens are returned in the order that they appear in the text
142     * </li>
143     * <li>the value of Token.text should be the same as calling
144     * text.substring(token.start, token.end)
145     * </ul>
146     *
147     * @param cs a sequence of characters to tokenize
148     * @return the tokens discovered in the character sequence, in order
149     * (true?).
150     */
151    default List<Token> tokenize(CharSequence cs) {
152        if (cs == null || cs.length() == 0) {
153            return Collections.emptyList();
154        }
155        List<Token> tokens = new ArrayList<>();
156        reset(cs);
157        while (advance()) {
158            tokens.add(getToken());
159        }
160        return tokens;
161    }
162
163    /**
164     * Uses this tokenizer to split a string into it's component substrings.
165     * Many applications will simply want the component strings making up a
166     * larger character sequence.
167     *
168     * @param cs the character sequence to tokenize
169     * @return a list of strings making up the character sequence.
170     */
171    default List<String> split(CharSequence cs) {
172        if (cs == null || cs.length() == 0) {
173            return Collections.emptyList();
174        }
175        List<String> tokens = new ArrayList<>();
176        reset(cs);
177        while (advance()) {
178            tokens.add(getText());
179        }
180        return tokens;
181    }
182}