001/*
002 * Copyright (c) 2015-2021, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.util.tokens.impl;
018
019import java.util.Arrays;
020
021import org.tribuo.util.tokens.Tokenizer;
022
023import com.oracle.labs.mlrg.olcut.config.Config;
024import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
025import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
026
027/**
028 * This implementation of {@link Tokenizer} is instantiated with an array of
029 * characters that are considered split characters. That is, the split
030 * characters define where to split the input text. It's a very simplistic
031 * tokenizer that has one simple exceptional case that it handles: how to deal
032 * with split characters that appear in between digits (e.g., 3/5 and 3.1415).
033 * It's not really very general purpose, but may suffice for some use cases.
034 * <p>
035 * In addition to the split characters specified it also splits on anything that
036 * is considered whitespace by {@link Character#isWhitespace(char)}.
037 * 
038 * @author Philip Ogren
039 */
040public class SplitCharactersTokenizer extends SplitFunctionTokenizer {
041
042    /**
043     * The default split characters.
044     */
045    public static final char[] DEFAULT_SPLIT_CHARACTERS = new char[] { '*', '(', ')', '&', '[', ']', '{', '}', '`',
046            '\'', '|', ';', ':', '\\', '!', '-', '?' };
047    /**
048     * The default characters which don't cause splits inside digits.
049     */
050    public static final char[] DEFAULT_SPLIT_EXCEPTING_IN_DIGITS_CHARACTERS = new char[] { '.', ',', '/', };
051
052    /**
053     * Splits tokens at the supplied characters.
054     */
055    public static class SplitCharactersSplitterFunction implements SplitFunction {
056
057        private final char[] splitCharacters;
058
059        private final char[] splitXDigitsCharacters;
060
061        /**
062         * Constructs a splitting function using the supplied split characters.
063         * @param splitCharacters The characters to split on.
064         * @param splitXDigitsCharacters Characters that are valid split points outside of a run of digits.
065         */
066        public SplitCharactersSplitterFunction(char[] splitCharacters, char[] splitXDigitsCharacters) {
067            this.splitCharacters = splitCharacters;
068            this.splitXDigitsCharacters = splitXDigitsCharacters;
069        }
070
071        @Override
072        public SplitResult apply(int codepoint, int index, CharSequence cs) {
073            if (isSplitCharacter((char) codepoint)) {
074                return SplitResult.SPLIT_AT;
075            }
076            if (isSplitXDigitCharacter((char) codepoint)) {
077                if (index == 0 || index == cs.length() - 1 || !Character.isDigit(cs.charAt(index - 1))
078                        || !Character.isDigit(cs.charAt(index + 1))) {
079                    return SplitResult.SPLIT_AT;
080                }
081            }
082            return SplitResult.NO_SPLIT_WORD;
083        }
084
085        /**
086         * Checks if this is a valid split character or whitespace.
087         * @param c The character to check.
088         * @return True if the character should split the token.
089         */
090        public boolean isSplitCharacter(char c) {
091            return isCharacter(c, splitCharacters) || Character.isWhitespace(c);
092        }
093
094        /**
095         * Checks if this a valid split character outside of a run of digits.
096         * @param c The character to check.
097         * @return True if the character should split the token.
098         */
099        public boolean isSplitXDigitCharacter(char c) {
100            return isCharacter(c, splitXDigitsCharacters);
101        }
102
103    }
104
105    @Config(description = "The characters to split on.")
106    private char[] splitCharacters = DEFAULT_SPLIT_CHARACTERS;
107
108    @Config(description = "The characters to split on unless we're in a number.")
109    private char[] splitXDigitsCharacters = DEFAULT_SPLIT_EXCEPTING_IN_DIGITS_CHARACTERS;
110
111    /**
112     * Creates a default split characters tokenizer using
113     * {@link #DEFAULT_SPLIT_CHARACTERS} and
114     * {@link #DEFAULT_SPLIT_EXCEPTING_IN_DIGITS_CHARACTERS}.
115     */
116    public SplitCharactersTokenizer() {
117        this.postConfig(); // I feel like I need to call this explicitly in case someone uses the default
118                           // constructor
119    }
120
121    @Override
122    public void postConfig() {
123        this.splitFunction = new SplitCharactersSplitterFunction(splitCharacters, splitXDigitsCharacters);
124    }
125
126    /**
127     * @param splitCharacters        characters to be replaced with a space in the
128     *                               input text (e.g., "abc|def" becomes "abc def")
129     * @param splitXDigitsCharacters characters to be replaced with a space in the
130     *                               input text except in the circumstance where the
131     *                               character immediately adjacent to the left and
132     *                               right are digits (e.g., "abc.def" becomes "abc
133     *                               def" but "3.1415" remains "3.1415").
134     */
135    public SplitCharactersTokenizer(char[] splitCharacters, char[] splitXDigitsCharacters) {
136        this.splitCharacters = splitCharacters;
137        this.splitXDigitsCharacters = splitXDigitsCharacters;
138        this.postConfig();
139    }
140
141    /**
142     * Creates a tokenizer that splits on whitespace.
143     * 
144     * @return A whitespace tokenizer.
145     */
146    public static SplitCharactersTokenizer createWhitespaceTokenizer() {
147        return new SplitCharactersTokenizer(new char[0], new char[0]);
148    }
149
150    @Override
151    public ConfiguredObjectProvenance getProvenance() {
152        return new ConfiguredObjectProvenanceImpl(this, "Tokenizer");
153    }
154
155    /**
156     * Is this character a split character for this tokenizer instance.
157     * 
158     * @param c The character to check.
159     * @return True if it's a split character.
160     */
161    @Deprecated
162    public boolean isSplitCharacter(char c) {
163        return isCharacter(c, splitCharacters) || Character.isWhitespace(c);
164    }
165
166    /**
167     * Is this character a split character except inside a digit for this tokenizer
168     * instance.
169     * 
170     * @param c The character to check.
171     * @return True if it's a split character.
172     */
173    @Deprecated
174    public boolean isSplitXDigitCharacter(char c) {
175        return isCharacter(c, splitXDigitsCharacters);
176    }
177
178    private static boolean isCharacter(char c, char[] chars) {
179        if (chars == null) {
180            return false;
181        }
182        for (char ch : chars) {
183            if (ch == c) {
184                return true;
185            }
186        }
187        return false;
188    }
189
190    /**
191     * Returns a copy of the split characters.
192     * 
193     * @return A copy of the split characters.
194     */
195    @Deprecated
196    public char[] getSplitCharacters() {
197        return Arrays.copyOf(splitCharacters, splitCharacters.length);
198    }
199
200    /**
201     * Returns a copy of the split characters except inside digits.
202     * 
203     * @return A copy of the split characters.
204     */
205    @Deprecated
206    public char[] getSplitXDigitsCharacters() {
207        return Arrays.copyOf(splitXDigitsCharacters, splitXDigitsCharacters.length);
208    }
209
210    @Override
211    public SplitCharactersTokenizer clone() {
212        return new SplitCharactersTokenizer(splitCharacters, splitXDigitsCharacters);
213    }
214
215}