001/*
002 * Copyright (c) 2015-2021, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.util.tokens.impl;
018
019import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
020import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
021
022/**
023 * A simple tokenizer that splits on whitespace.  This tokenizer does not create
024 * tokens that correspond to whitespace - only those spans of text delimited by
025 * whitespace.  For example, the text "a b" will result in two tokens "a" and "b". 
026 */
027public class WhitespaceTokenizer extends SplitFunctionTokenizer {
028
029    /**
030     * The splitting function for whitespace, using {@link Character#isWhitespace(char)}.
031     */
032    public static final SplitFunction whitespaceSplitCharacterFunction = (codepoint, index,
033            cs) -> Character.isWhitespace(codepoint) ? SplitResult.SPLIT_AT : SplitResult.NO_SPLIT_WORD;
034
035    /**
036     * Constructs a tokenizer that splits on whitespace.
037     */
038    public WhitespaceTokenizer() {
039        super(whitespaceSplitCharacterFunction);
040    }
041    
042    @Override
043    public ConfiguredObjectProvenance getProvenance() {
044        return new ConfiguredObjectProvenanceImpl(this, "Tokenizer");
045    }
046
047    @Override
048    public WhitespaceTokenizer clone() {
049        return new WhitespaceTokenizer();
050    }
051
052}