001/* 002 * Copyright (c) 2015-2021, Oracle and/or its affiliates. All rights reserved. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.tribuo.util.tokens.impl; 018 019import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance; 020import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl; 021 022/** 023 * A simple tokenizer that splits on whitespace. This tokenizer does not create 024 * tokens that correspond to whitespace - only those spans of text delimited by 025 * whitespace. For example, the text "a b" will result in two tokens "a" and "b". 026 */ 027public class WhitespaceTokenizer extends SplitFunctionTokenizer { 028 029 /** 030 * The splitting function for whitespace, using {@link Character#isWhitespace(char)}. 031 */ 032 public static final SplitFunction whitespaceSplitCharacterFunction = (codepoint, index, 033 cs) -> Character.isWhitespace(codepoint) ? SplitResult.SPLIT_AT : SplitResult.NO_SPLIT_WORD; 034 035 /** 036 * Constructs a tokenizer that splits on whitespace. 037 */ 038 public WhitespaceTokenizer() { 039 super(whitespaceSplitCharacterFunction); 040 } 041 042 @Override 043 public ConfiguredObjectProvenance getProvenance() { 044 return new ConfiguredObjectProvenanceImpl(this, "Tokenizer"); 045 } 046 047 @Override 048 public WhitespaceTokenizer clone() { 049 return new WhitespaceTokenizer(); 050 } 051 052}