/*
 * Copyright (c) 2015-2021, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
016
017package org.tribuo.util.tokens.impl;
018
019import org.tribuo.util.tokens.Token;
020import org.tribuo.util.tokens.Token.TokenType;
021import org.tribuo.util.tokens.Tokenizer;
022
023/**
024 * This class supports character-by-character (that is, codepoint-by-codepoint)
025 * iteration over input text to create tokens. Extensions of this class are
026 * initialized with a {@link SplitFunction} which will be called for each character and
027 * a {@link SplitResult} consisting of a {@link SplitType} and a {@link TokenType} will be returned.
028 * Tokenization is achieved based on the {@link SplitResult} returned for each
029 * character. Please see notes below for each {@link SplitType} and {@link SplitResult}.
030 */
031public abstract class SplitFunctionTokenizer implements Tokenizer {
032
033    /**
034     * Defines different ways that a tokenizer can split the input text at a given character.
035     */
036    public enum SplitType {
037        /**
038         * the current character is added to the in-progress token (i.e. do not split on
039         * the current character)
040         */
041        NO_SPLIT,
042        /**
043         * The current character will cause the in-progress token to be completed. the
044         * current character will not be included in any returned token and the token
045         * type of the corresponding SplitResult is ignored (See {@link SplitResult#SPLIT_AT}).
046         * This SplitType may be useful for whitespace.
047         */
048        SPLIT_AT,
049        /**
050         * The current character will cause the in-progress token to be completed the
051         * current character will be included in the next token. The token type of the
052         * corresponding SplitResult is ignored (See {@link SplitResult#SPLIT_BEFORE}). This
053         * SplitType may be useful for e.g. capitalized letters when CamelCase splitting
054         * of digits when separating out a currency symbol.
055         */
056        SPLIT_BEFORE,
057        /**
058         * The current character will cause the in-progress token to be completed after
059         * the current character is appended to the in-progress token. The token type of
060         * the created token (that includes the current character) will be assigned the
061         * type included with the {@link SplitResult}.
062         */
063        SPLIT_AFTER,
064        /**
065         * The current character should cause the in-progress token to be completed. The
066         * token assigned to the in-progress token will be whatever was previously
067         * assigned to the previous character. This token will be followed by a second
068         * single-character token consisting of the current character. The token type
069         * assigned to this second token will be provided with the {@link SplitResult}.
070         */
071        SPLIT_BEFORE_AND_AFTER
072    }
073
074    /**
075     * A combination of a {@link SplitType} and a {@link TokenType}. The TokenType of some
076     * SplitResult values are ignored and so not every combination of SplitType and
077     * TokenType is provided. For example, {@link SplitType#SPLIT_AT} and
078     * {@link SplitType#SPLIT_BEFORE} (as described above) create tokens whose types have
079     * already been determined.
080     */
081    public enum SplitResult {
082        /**
083         * Not a split, is a word.
084         */
085        NO_SPLIT_WORD(SplitType.NO_SPLIT, TokenType.WORD),
086        /**
087         * Not a split, is a ngram.
088         */
089        NO_SPLIT_NGRAM(SplitType.NO_SPLIT, TokenType.NGRAM),
090        /**
091         * Not a split, is punctuation.
092         */
093        NO_SPLIT_PUNCTUATION(SplitType.NO_SPLIT, TokenType.PUNCTUATION),
094        /**
095         * Not a split, is whitespace.
096         */
097        NO_SPLIT_WHITESPACE(SplitType.NO_SPLIT, TokenType.WHITESPACE),
098        /**
099         * Not a split, is a prefix.
100         */
101        NO_SPLIT_PREFIX(SplitType.NO_SPLIT, TokenType.PREFIX),
102        /**
103         * Not a split, is a suffix.
104         */
105        NO_SPLIT_SUFFIX(SplitType.NO_SPLIT, TokenType.SUFFIX),
106        /**
107         * Not a split, is infix.
108         */
109        NO_SPLIT_INFIX(SplitType.NO_SPLIT, TokenType.INFIX),
110        /**
111         * Not a split, is unknown.
112         */
113        NO_SPLIT_UNKNOWN(SplitType.NO_SPLIT, TokenType.UNKNOWN),
114        /**
115         * Split at.
116         */
117        SPLIT_AT(SplitType.SPLIT_AT, TokenType.WORD), //the token type is ignored
118        /**
119         * Split before.
120         */
121        SPLIT_BEFORE(SplitType.SPLIT_BEFORE, TokenType.WORD), //the token type is ignored
122        /**
123         * Split after a word.
124         */
125        SPLIT_AFTER_WORD(SplitType.SPLIT_AFTER, TokenType.WORD),
126        /**
127         * Split after a ngram.
128         */
129        SPLIT_AFTER_NGRAM(SplitType.SPLIT_AFTER, TokenType.NGRAM),
130        /**
131         * Split after punctuation.
132         */
133        SPLIT_AFTER_PUNCTUATION(SplitType.SPLIT_AFTER, TokenType.PUNCTUATION),
134        /**
135         * Split after whitespace.
136         */
137        SPLIT_AFTER_WHITESPACE(SplitType.SPLIT_AFTER, TokenType.WHITESPACE),
138        /**
139         * Split after a prefix.
140         */
141        SPLIT_AFTER_PREFIX(SplitType.SPLIT_AFTER, TokenType.PREFIX),
142        /**
143         * Split after a suffix.
144         */
145        SPLIT_AFTER_SUFFIX(SplitType.SPLIT_AFTER, TokenType.SUFFIX),
146        /**
147         * Split after infix.
148         */
149        SPLIT_AFTER_INFIX(SplitType.SPLIT_AFTER, TokenType.INFIX),
150        /**
151         * Split after an unknown value.
152         */
153        SPLIT_AFTER_UNKNOWN(SplitType.SPLIT_AFTER, TokenType.UNKNOWN),
154        /**
155         * Split before and after a word.
156         */
157        SPLIT_BEFORE_AND_AFTER_WORD(SplitType.SPLIT_BEFORE_AND_AFTER, TokenType.WORD),
158        /**
159         * Split before and after a ngram.
160         */
161        SPLIT_BEFORE_AND_AFTER_NGRAM(SplitType.SPLIT_BEFORE_AND_AFTER, TokenType.NGRAM),
162        /**
163         * Split before and after punctuation.
164         */
165        SPLIT_BEFORE_AND_AFTER_PUNCTUATION(SplitType.SPLIT_BEFORE_AND_AFTER, TokenType.PUNCTUATION),
166        /**
167         * Split before and after whitespace.
168         */
169        SPLIT_BEFORE_AND_AFTER_WHITESPACE(SplitType.SPLIT_BEFORE_AND_AFTER, TokenType.WHITESPACE),
170        /**
171         * Split before and after prefix.
172         */
173        SPLIT_BEFORE_AND_AFTER_PREFIX(SplitType.SPLIT_BEFORE_AND_AFTER, TokenType.PREFIX),
174        /**
175         * Split before and after suffix.
176         */
177        SPLIT_BEFORE_AND_AFTER_SUFFIX(SplitType.SPLIT_BEFORE_AND_AFTER, TokenType.SUFFIX),
178        /**
179         * Split before and after infix.
180         */
181        SPLIT_BEFORE_AND_AFTER_INFIX(SplitType.SPLIT_BEFORE_AND_AFTER, TokenType.INFIX),
182        /**
183         * Split before and after unknown.
184         */
185        SPLIT_BEFORE_AND_AFTER_UNKNOWN(SplitType.SPLIT_BEFORE_AND_AFTER, TokenType.UNKNOWN);
186
187        /**
188         * The split type.
189         */
190        public final SplitType splitType;
191        /**
192         * The token type.
193         */
194        public final TokenType tokenType;
195
196        SplitResult(SplitType splitType, TokenType tokenType) {
197            this.splitType = splitType;
198            this.tokenType = tokenType;
199        }
200    }
201
202    /**
203     * An interface for checking if the text should be split at the supplied codepoint.
204     */
205    @FunctionalInterface
206    public static interface SplitFunction {
207        /**
208         * Applies the split function.
209         * @param codepoint The codepoint to check.
210         * @param index The character index.
211         * @param cs The sequence that's being split.
212         * @return How the sequence should be split.
213         */
214        public SplitResult apply(int codepoint, int index, CharSequence cs);
215    }
216
217    /**
218     * The splitting function.
219     */
220    protected SplitFunction splitFunction;
221
222    /**
223     * Constructs a tokenizer, used by OLCUT.
224     */
225    protected SplitFunctionTokenizer() { }
226
227    /**
228     * Creates a new tokenizer using the supplied split function.
229     * @param splitFunction The split function.
230     */
231    public SplitFunctionTokenizer(SplitFunction splitFunction) {
232        super();
233        this.splitFunction = splitFunction;
234    }
235
236    private String cs;
237
238    private int start;
239
240    private int p;
241
242    private StringBuilder tokenSb = new StringBuilder();
243
244    private TokenType currentType = TokenType.WORD;
245
246    private Token currentToken;
247
248    private Token nextToken;
249
250    private boolean ready;
251
252    @Override
253    public void reset(CharSequence cs) {
254        this.cs = cs.toString();
255        start = 0;
256        p = 0;
257        tokenSb.delete(0, tokenSb.length());
258        ready = false;
259    }
260
261    @Override
262    public boolean advance() {
263        if (cs == null) {
264            throw new IllegalStateException("SplitFunctionTokenizer has not been reset.");
265        }
266        if (nextToken != null) {
267            currentToken = nextToken;
268            nextToken = null;
269            return true;
270        }
271        if (p >= cs.length()) {
272            return false;
273        }
274
275        currentToken = null;
276
277        SplitResult splitResult;
278        SplitType splitType;
279        TokenType tokenType;
280
281        tokenSb.delete(0, tokenSb.length());
282        while (p < cs.length()) {
283            int codepoint = cs.codePointAt(p);
284            splitResult = splitFunction.apply(codepoint, p, cs);
285            splitType = splitResult.splitType;
286            tokenType = splitResult.tokenType;
287            // If we want to keep it, then go ahead and do that and remember
288            // where the end of the token is.
289            if (splitType == SplitType.NO_SPLIT) {
290                if (tokenSb.length() == 0) {
291                    start = p;
292                }
293                p += Character.charCount(codepoint);
294                tokenSb.appendCodePoint(codepoint);
295                currentType = tokenType;
296                continue;
297            }
298
299            if (splitType == SplitType.SPLIT_AT) {
300                if (tokenSb.length() > 0) {
301                    currentToken = new Token(tokenSb.toString(), start, p, currentType);
302                }
303                p += Character.charCount(codepoint);
304                start = p;
305                tokenSb.delete(0, tokenSb.length());
306            } else if (splitType == SplitType.SPLIT_BEFORE) {
307                if (tokenSb.length() > 0) {
308                    currentToken = new Token(tokenSb.toString(), start, p, currentType);
309                }
310                start = p;
311                tokenSb.delete(0, tokenSb.length());
312                tokenSb.appendCodePoint(codepoint);
313                p += Character.charCount(codepoint);
314            } else if (splitType == SplitType.SPLIT_AFTER) {
315                p += Character.charCount(codepoint);
316                tokenSb.appendCodePoint(codepoint);
317                // no need to check the length since we just added a code point
318                currentToken = new Token(tokenSb.toString(), start, p, tokenType);
319                tokenSb.delete(0, tokenSb.length());
320                start = p;
321            } else if (splitType == SplitType.SPLIT_BEFORE_AND_AFTER) {
322                // wrap up the token we are currently building and then create
323                // the next token which consists of just the character
324                if (tokenSb.length() > 0) {
325                    currentToken = new Token(tokenSb.toString(), start, p, currentType);
326                    tokenSb.delete(0, tokenSb.length());
327                    start = p;
328                    p += Character.charCount(codepoint);
329                    tokenSb.appendCodePoint(codepoint);
330                    nextToken = new Token(tokenSb.toString(), start, p, tokenType);
331                    tokenSb.delete(0, tokenSb.length());
332                } else {
333                    start = p;
334                    p += Character.charCount(codepoint);
335                    tokenSb.appendCodePoint(codepoint);
336                    currentToken = new Token(tokenSb.toString(), start, p, tokenType);
337                    tokenSb.delete(0, tokenSb.length());
338                }
339            }
340            if (currentToken != null) {
341                break;
342            }
343        }
344
345        if (currentToken == null) {
346            if (tokenSb.length() > 0) {
347                currentToken = new Token(tokenSb.toString(), start, p, currentType);
348            }
349        }
350
351        // We advanced if we have some stuff collected.
352        if (currentToken != null) {
353            ready = true;
354            return true;
355        } else {
356            return false;
357        }
358    }
359
360    @Override
361    public String getText() {
362        if (ready) {
363            return currentToken.text;
364        } else {
365            throw new IllegalStateException("SplitFunctionTokenizer is not ready.");
366        }
367    }
368
369    @Override
370    public int getStart() {
371        if (ready) {
372            return currentToken.start;
373        } else {
374            throw new IllegalStateException("SplitFunctionTokenizer is not ready.");
375        }
376    }
377
378    @Override
379    public int getEnd() {
380        if (ready) {
381            return currentToken.end;
382        } else {
383            throw new IllegalStateException("SplitFunctionTokenizer is not ready.");
384        }
385    }
386
387    @Override
388    public TokenType getType() {
389        return currentToken.type;
390    }
391
392    @Override
393    public Tokenizer clone() throws CloneNotSupportedException {
394        throw new UnsupportedOperationException(
395                "abstract class SplitFunctionTokenizer does not implement clone method.  Subclasses must implement this method.");
396    }
397
398}