001/* 002 * Copyright (c) 2015-2021, Oracle and/or its affiliates. All rights reserved. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.tribuo.util.tokens.impl; 018 019import java.util.Arrays; 020 021import org.tribuo.util.tokens.Tokenizer; 022 023import com.oracle.labs.mlrg.olcut.config.Config; 024import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance; 025import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl; 026 027/** 028 * This implementation of {@link Tokenizer} is instantiated with an array of 029 * characters that are considered split characters. That is, the split 030 * characters define where to split the input text. It's a very simplistic 031 * tokenizer that has one simple exceptional case that it handles: how to deal 032 * with split characters that appear in between digits (e.g., 3/5 and 3.1415). 033 * It's not really very general purpose, but may suffice for some use cases. 034 * <p> 035 * In addition to the split characters specified it also splits on anything that 036 * is considered whitespace by {@link Character#isWhitespace(char)}. 037 * 038 * @author Philip Ogren 039 */ 040public class SplitCharactersTokenizer extends SplitFunctionTokenizer { 041 042 /** 043 * The default split characters. 044 */ 045 public static final char[] DEFAULT_SPLIT_CHARACTERS = new char[] { '*', '(', ')', '&', '[', ']', '{', '}', '`', 046 '\'', '|', ';', ':', '\\', '!', '-', '?' }; 047 /** 048 * The default characters which don't cause splits inside digits. 049 */ 050 public static final char[] DEFAULT_SPLIT_EXCEPTING_IN_DIGITS_CHARACTERS = new char[] { '.', ',', '/', }; 051 052 /** 053 * Splits tokens at the supplied characters. 054 */ 055 public static class SplitCharactersSplitterFunction implements SplitFunction { 056 057 private final char[] splitCharacters; 058 059 private final char[] splitXDigitsCharacters; 060 061 /** 062 * Constructs a splitting function using the supplied split characters. 063 * @param splitCharacters The characters to split on. 064 * @param splitXDigitsCharacters Characters that are valid split points outside of a run of digits. 065 */ 066 public SplitCharactersSplitterFunction(char[] splitCharacters, char[] splitXDigitsCharacters) { 067 this.splitCharacters = splitCharacters; 068 this.splitXDigitsCharacters = splitXDigitsCharacters; 069 } 070 071 @Override 072 public SplitResult apply(int codepoint, int index, CharSequence cs) { 073 if (isSplitCharacter((char) codepoint)) { 074 return SplitResult.SPLIT_AT; 075 } 076 if (isSplitXDigitCharacter((char) codepoint)) { 077 if (index == 0 || index == cs.length() - 1 || !Character.isDigit(cs.charAt(index - 1)) 078 || !Character.isDigit(cs.charAt(index + 1))) { 079 return SplitResult.SPLIT_AT; 080 } 081 } 082 return SplitResult.NO_SPLIT_WORD; 083 } 084 085 /** 086 * Checks if this is a valid split character or whitespace. 087 * @param c The character to check. 088 * @return True if the character should split the token. 089 */ 090 public boolean isSplitCharacter(char c) { 091 return isCharacter(c, splitCharacters) || Character.isWhitespace(c); 092 } 093 094 /** 095 * Checks if this a valid split character outside of a run of digits. 096 * @param c The character to check. 097 * @return True if the character should split the token. 098 */ 099 public boolean isSplitXDigitCharacter(char c) { 100 return isCharacter(c, splitXDigitsCharacters); 101 } 102 103 } 104 105 @Config(description = "The characters to split on.") 106 private char[] splitCharacters = DEFAULT_SPLIT_CHARACTERS; 107 108 @Config(description = "The characters to split on unless we're in a number.") 109 private char[] splitXDigitsCharacters = DEFAULT_SPLIT_EXCEPTING_IN_DIGITS_CHARACTERS; 110 111 /** 112 * Creates a default split characters tokenizer using 113 * {@link #DEFAULT_SPLIT_CHARACTERS} and 114 * {@link #DEFAULT_SPLIT_EXCEPTING_IN_DIGITS_CHARACTERS}. 115 */ 116 public SplitCharactersTokenizer() { 117 this.postConfig(); // I feel like I need to call this explicitly in case someone uses the default 118 // constructor 119 } 120 121 @Override 122 public void postConfig() { 123 this.splitFunction = new SplitCharactersSplitterFunction(splitCharacters, splitXDigitsCharacters); 124 } 125 126 /** 127 * @param splitCharacters characters to be replaced with a space in the 128 * input text (e.g., "abc|def" becomes "abc def") 129 * @param splitXDigitsCharacters characters to be replaced with a space in the 130 * input text except in the circumstance where the 131 * character immediately adjacent to the left and 132 * right are digits (e.g., "abc.def" becomes "abc 133 * def" but "3.1415" remains "3.1415"). 134 */ 135 public SplitCharactersTokenizer(char[] splitCharacters, char[] splitXDigitsCharacters) { 136 this.splitCharacters = splitCharacters; 137 this.splitXDigitsCharacters = splitXDigitsCharacters; 138 this.postConfig(); 139 } 140 141 /** 142 * Creates a tokenizer that splits on whitespace. 143 * 144 * @return A whitespace tokenizer. 145 */ 146 public static SplitCharactersTokenizer createWhitespaceTokenizer() { 147 return new SplitCharactersTokenizer(new char[0], new char[0]); 148 } 149 150 @Override 151 public ConfiguredObjectProvenance getProvenance() { 152 return new ConfiguredObjectProvenanceImpl(this, "Tokenizer"); 153 } 154 155 /** 156 * Is this character a split character for this tokenizer instance. 157 * 158 * @param c The character to check. 159 * @return True if it's a split character. 160 */ 161 @Deprecated 162 public boolean isSplitCharacter(char c) { 163 return isCharacter(c, splitCharacters) || Character.isWhitespace(c); 164 } 165 166 /** 167 * Is this character a split character except inside a digit for this tokenizer 168 * instance. 169 * 170 * @param c The character to check. 171 * @return True if it's a split character. 172 */ 173 @Deprecated 174 public boolean isSplitXDigitCharacter(char c) { 175 return isCharacter(c, splitXDigitsCharacters); 176 } 177 178 private static boolean isCharacter(char c, char[] chars) { 179 if (chars == null) { 180 return false; 181 } 182 for (char ch : chars) { 183 if (ch == c) { 184 return true; 185 } 186 } 187 return false; 188 } 189 190 /** 191 * Returns a copy of the split characters. 192 * 193 * @return A copy of the split characters. 194 */ 195 @Deprecated 196 public char[] getSplitCharacters() { 197 return Arrays.copyOf(splitCharacters, splitCharacters.length); 198 } 199 200 /** 201 * Returns a copy of the split characters except inside digits. 202 * 203 * @return A copy of the split characters. 204 */ 205 @Deprecated 206 public char[] getSplitXDigitsCharacters() { 207 return Arrays.copyOf(splitXDigitsCharacters, splitXDigitsCharacters.length); 208 } 209 210 @Override 211 public SplitCharactersTokenizer clone() { 212 return new SplitCharactersTokenizer(splitCharacters, splitXDigitsCharacters); 213 } 214 215}