001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.util.tokens.impl;
018
019import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
020import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
021import org.tribuo.util.tokens.Token;
022import org.tribuo.util.tokens.Tokenizer;
023
024/**
025 * This tokenizer is loosely based on the notion of word shape which is a common
026 * feature used in NLP. The idea here is that continuous runs of letters in the
027 * same character class will be grouped together. White space characters are
028 * used as delimiters. The character classes are: uppercase letters, lowercase
029 * letters, digits, and everything else goes into its own character class. So,
030 * for example, "1234abcd" would be split into "1234" and "abcd". And "!@#$"
031 * would result in four tokens. Please see unit tests.
032 * <p>
033 * Strings are split according to whitespace and contiguous runs of characters
034 * in the same character classes. Except for one exception - if uppercase
035 * letters are immediately followed by lowercase letters, then we keep them
036 * together. This has the effect of recognizing camel case and splits
037 * "CamelCase" into "Camel" and "Case". It also splits "ABCdef AAbb" into
038 * "ABCdef" and "AAbb".
039 */
040public class ShapeTokenizer implements Tokenizer {
041
042    private String cs;
043
044    private int pos;
045
046    private String token;
047
048    private StringBuilder tb = new StringBuilder();
049
050    private int start;
051
052    private int end;
053
054    private char currClass;
055
056    private int prevClass;
057
058    private boolean ready;
059
060    /**
061     * Constructs a ShapeTokenizer.
062     */
063    public ShapeTokenizer() { }
064
065    @Override
066    public ConfiguredObjectProvenance getProvenance() {
067        return new ConfiguredObjectProvenanceImpl(this, "Tokenizer");
068    }
069
070    @Override
071    public void reset(CharSequence cs) {
072        this.cs = cs.toString();
073        pos = 0;
074        start = -1;
075        end = -1;
076        prevClass = -1;
077        token = null;
078        ready = false;
079    }
080
081    private char getClass(int cp) {
082        if (Character.isUpperCase(cp)) {
083            return 'A';
084        } else if (Character.isLowerCase(cp)) {
085            return 'a';
086        } else if (Character.isDigit(cp)) {
087            return '1';
088        } else if (Character.isWhitespace(cp)) {
089            return ' ';
090        } else {
091            return (char) cp;
092        }
093    }
094
095    @Override
096    public boolean advance() {
097        if (cs == null) {
098            throw new IllegalStateException("ShapeTokenizer has not been reset.");
099        }
100        tb.delete(0, tb.length());
101        start = pos;
102        while (pos < cs.length()) {
103            int cp = cs.codePointAt(pos);
104            int lcp = Character.charCount(cp);
105
106            currClass = getClass(cp);
107
108            //
109            // Skip spaces at the start of the token.
110            if (tb.length() == 0 && currClass == ' ') {
111                pos += lcp;
112                start = pos;
113                prevClass = currClass;
114                continue;
115            }
116
117            //
118            // When do we want to end the current token? When we cross a boundary
119            // between token classes when we're not at the start of the string,
120            // except when that boundary is between 
121            // upper and lower case characters.
122            if (currClass != prevClass && prevClass != -1) {
123                if (!(prevClass == 'A' && currClass == 'a')) {
124                    if (tb.length() > 0) {
125                        token = tb.toString();
126                        prevClass = currClass;
127                        //
128                        // Note that we're not increasing pos here: we want
129                        // to work on this current character the next time that
130                        // we get called!
131                        ready = true;
132                        return true;
133                    }
134                }
135            }
136
137            //
138            // We didn't end the token, so collect the current character,
139            // unless it's a space!
140            if (currClass != ' ') {
141                tb.appendCodePoint(cp);
142            }
143            prevClass = currClass;
144            pos += lcp;
145            end = pos;
146        }
147
148        if (tb.length() > 0) {
149            token = tb.toString();
150            ready = true;
151            return true;
152        }
153
154        return false;
155    }
156
157    @Override
158    public String getText() {
159        if (ready) {
160            return token;
161        } else {
162            throw new IllegalStateException("ShapeTokenizer is not ready.");
163        }
164    }
165
166    @Override
167    public int getStart() {
168        if (ready) {
169            return start;
170        } else {
171            throw new IllegalStateException("ShapeTokenizer is not ready.");
172        }
173    }
174
175    @Override
176    public int getEnd() {
177        if (ready) {
178            return end;
179        } else {
180            throw new IllegalStateException("ShapeTokenizer is not ready.");
181        }
182    }
183
184    @Override
185    public Token.TokenType getType() {
186        if (ready) {
187            return Token.TokenType.WORD;
188        } else {
189            throw new IllegalStateException("ShapeTokenizer is not ready.");
190        }
191    }
192
193    @Override
194    public ShapeTokenizer clone() {
195        try {
196            ShapeTokenizer copy = (ShapeTokenizer) super.clone();
197            copy.tb = new StringBuilder();
198            copy.ready = false;
199            copy.cs = null;
200            return copy;
201        } catch (CloneNotSupportedException e) {
202            throw new AssertionError("ShapeTokenizer is Cloneable, but clone call failed");
203        }
204    }
205
206}