001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.util.tokens.options;
018
019import com.oracle.labs.mlrg.olcut.config.Option;
020import org.tribuo.util.tokens.Tokenizer;
021import org.tribuo.util.tokens.impl.NonTokenizer;
022import org.tribuo.util.tokens.impl.ShapeTokenizer;
023import org.tribuo.util.tokens.universal.UniversalTokenizer;
024
025import java.util.logging.Logger;
026
027/**
028 * CLI Options for all the tokenizers in the core package.
029 */
030public class CoreTokenizerOptions implements TokenizerOptions {
031
032    private static final Logger logger = Logger.getLogger(CoreTokenizerOptions.class.getName());
033    /**
034     * Options for the break iterator tokenizer.
035     */
036    public BreakIteratorTokenizerOptions breakIteratorOptions;
037    /**
038     * Options for the split characters tokenizer.
039     */
040    public SplitCharactersTokenizerOptions splitCharactersTokenizerOptions;
041    /**
042     * Options for the split pattern tokenizer.
043     */
044    public SplitPatternTokenizerOptions splitPatternTokenizerOptions;
045    /**
046     * Type of tokenizer
047     */
048    @Option(longName = "core-tokenizer-type", usage = "Type of tokenizer")
049    public CoreTokenizerType coreTokenizerType = CoreTokenizerType.SPLIT_CHARACTERS;
050
051    @Override
052    public Tokenizer getTokenizer() {
053        Tokenizer tokenizer;
054        logger.info("Using " + coreTokenizerType);
055        switch (coreTokenizerType) {
056            case BREAK_ITERATOR:
057                tokenizer = breakIteratorOptions.getTokenizer();
058                break;
059            case SPLIT_CHARACTERS:
060                tokenizer = splitCharactersTokenizerOptions.getTokenizer();
061                break;
062            case NON:
063                tokenizer = new NonTokenizer();
064                break;
065            case SHAPE:
066                tokenizer = new ShapeTokenizer();
067                break;
068            case SPLIT_PATTERN:
069                tokenizer = splitPatternTokenizerOptions.getTokenizer();
070                break;
071            case UNIVERSAL:
072                tokenizer = new UniversalTokenizer();
073                break;
074            default:
075                throw new IllegalArgumentException("Unknown tokenizer " + coreTokenizerType);
076        }
077        return tokenizer;
078    }
079
080    /**
081     * Tokenizer type.
082     */
083    public enum CoreTokenizerType {
084        /**
085         * Creates a {@link org.tribuo.util.tokens.impl.BreakIteratorTokenizer}.
086         */
087        BREAK_ITERATOR,
088        /**
089         * Creates a {@link org.tribuo.util.tokens.impl.SplitCharactersTokenizer}.
090         */
091        SPLIT_CHARACTERS,
092        /**
093         * Creates a {@link NonTokenizer}.
094         */
095        NON,
096        /**
097         * Creates a {@link ShapeTokenizer}.
098         */
099        SHAPE,
100        /**
101         * Creates a {@link org.tribuo.util.tokens.impl.SplitPatternTokenizer}.
102         */
103        SPLIT_PATTERN,
104        /**
105         * Creates a {@link UniversalTokenizer}.
106         */
107        UNIVERSAL
108    }
109
110}