/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.util.tokens;

import com.oracle.labs.mlrg.olcut.config.Configurable;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.Provenancable;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.function.Supplier;

/**
 * An interface for things that tokenize text: breaking it into words according
 * to some set of rules.
 * <p>
 * Note that tokenizers are not guaranteed to be thread safe! Using the same
 * tokenizer from multiple threads may result in strange behavior. Use
 * {@link #createThreadLocal(Tokenizer)} to give each thread its own copy.
 * <p>
 * Tokenizers which are not ready throw {@link IllegalStateException}
 * when {@link Tokenizer#advance} or any get method is called.
 * <p>
 * This interface extends {@link Cloneable}; note that {@link #clone()} is still
 * permitted to throw {@link CloneNotSupportedException}.
 */
public interface Tokenizer extends Configurable, Cloneable, Provenancable<ConfiguredObjectProvenance> {

    /**
     * Creates a supplier from the specified tokenizer by cloning it.
     * <p>
     * Each call to {@link Supplier#get()} clones the supplied tokenizer. If the
     * clone fails, the {@link CloneNotSupportedException} is rethrown wrapped in
     * a {@link RuntimeException}.
     *
     * @param tokenizer The tokenizer to copy.
     * @return A supplier of tokenizers.
     */
    static Supplier<Tokenizer> createSupplier(Tokenizer tokenizer) {
        return () -> {
            try {
                return tokenizer.clone();
            } catch (CloneNotSupportedException e) {
                // Supplier.get() cannot throw checked exceptions, so wrap it,
                // keeping the cause and naming the offending implementation.
                throw new RuntimeException(
                        "Failed to clone tokenizer of type " + tokenizer.getClass().getName(), e);
            }
        };
    }

    /**
     * Creates a thread local source of tokenizers by making a Tokenizer supplier
     * using {@link #createSupplier(Tokenizer)}.
     *
     * @param tokenizer The tokenizer to copy.
     * @return A thread local for tokenizers.
     */
    static ThreadLocal<Tokenizer> createThreadLocal(Tokenizer tokenizer) {
        return ThreadLocal.withInitial(createSupplier(tokenizer));
    }

    /**
     * Resets the tokenizer so that it operates on a new sequence of characters.
     *
     * @param cs a character sequence to tokenize
     */
    void reset(CharSequence cs);

    /**
     * Advances the tokenizer to the next token.
     *
     * @return {@code true} if there is such a token, {@code false}
     * otherwise.
     */
    boolean advance();

    /**
     * Gets the text of the current token, as a string.
     *
     * @return the text of the current token
     */
    String getText();

    /**
     * Gets the starting character offset of the current token in the character
     * sequence.
     *
     * @return the starting character offset of the token
     */
    int getStart();

    /**
     * Gets the ending offset (exclusive) of the current token in the character
     * sequence.
     *
     * @return the exclusive ending character offset for the current token.
     */
    int getEnd();

    /**
     * Gets the type of the current token.
     *
     * @return the type of the current token.
     */
    Token.TokenType getType();

    /**
     * Clones a tokenizer with its configuration. Cloned tokenizers are
     * not processing the same text as the original tokenizer and need to be reset
     * with a fresh CharSequence.
     *
     * @return A tokenizer with the same configuration, but independent state.
     * @throws CloneNotSupportedException if the tokenizer isn't cloneable.
     */
    Tokenizer clone() throws CloneNotSupportedException;

    /**
     * Generates a Token object from the current state of the tokenizer.
     * <p>
     * The token is built from {@link #getText()}, {@link #getStart()},
     * {@link #getEnd()} and {@link #getType()}.
     *
     * @return The token object from the current state.
     */
    default Token getToken() {
        return new Token(getText(), getStart(), getEnd(), getType());
    }

    /**
     * Uses this tokenizer to tokenize a string and return the list of tokens
     * that were generated. Many applications will simply want to take a
     * character sequence and get a list of tokens, so this will do that for
     * them.
     *
     * <p>
     * Here is the contract of the tokenize function:
     * <ul>
     * <li>all returned tokens correspond to substrings of the input text</li>
     * <li>the tokens do not overlap</li>
     * <li>the tokens are returned in the order that they appear in the text</li>
     * <li>the value of Token.text should be the same as calling
     * text.substring(token.start, token.end)</li>
     * </ul>
     *
     * <p>
     * If {@code cs} is {@code null} or empty, the tokenizer is not reset and an
     * empty list is returned. The returned list is always a fresh, mutable list.
     *
     * @param cs a sequence of characters to tokenize
     * @return the tokens discovered in the character sequence, in order.
     */
    default List<Token> tokenize(CharSequence cs) {
        List<Token> tokens = new ArrayList<>();
        if (cs == null || cs.length() == 0) {
            // Nothing to tokenize; leave the tokenizer's current state alone.
            return tokens;
        }
        reset(cs);
        while (advance()) {
            tokens.add(getToken());
        }
        return tokens;
    }

    /**
     * Uses this tokenizer to split a string into its component substrings.
     * Many applications will simply want the component strings making up a
     * larger character sequence.
     *
     * <p>
     * If {@code cs} is {@code null} or empty, the tokenizer is not reset and an
     * empty list is returned. The returned list is always a fresh, mutable list.
     *
     * @param cs the character sequence to tokenize
     * @return a list of strings making up the character sequence.
     */
    default List<String> split(CharSequence cs) {
        List<String> tokens = new ArrayList<>();
        if (cs == null || cs.length() == 0) {
            // Nothing to split; leave the tokenizer's current state alone.
            return tokens;
        }
        reset(cs);
        while (advance()) {
            tokens.add(getText());
        }
        return tokens;
    }
}