001/* 002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.tribuo.util.tokens.options; 018 019import com.oracle.labs.mlrg.olcut.config.Option; 020import org.tribuo.util.tokens.Tokenizer; 021import org.tribuo.util.tokens.impl.NonTokenizer; 022import org.tribuo.util.tokens.impl.ShapeTokenizer; 023import org.tribuo.util.tokens.universal.UniversalTokenizer; 024 025import java.util.logging.Logger; 026 027/** 028 * CLI Options for all the tokenizers in the core package. 029 */ 030public class CoreTokenizerOptions implements TokenizerOptions { 031 032 private static final Logger logger = Logger.getLogger(CoreTokenizerOptions.class.getName()); 033 /** 034 * Options for the break iterator tokenizer. 035 */ 036 public BreakIteratorTokenizerOptions breakIteratorOptions; 037 /** 038 * Options for the split characters tokenizer. 039 */ 040 public SplitCharactersTokenizerOptions splitCharactersTokenizerOptions; 041 /** 042 * Options for the split pattern tokenizer. 043 */ 044 public SplitPatternTokenizerOptions splitPatternTokenizerOptions; 045 /** 046 * Type of tokenizer 047 */ 048 @Option(longName = "core-tokenizer-type", usage = "Type of tokenizer") 049 public CoreTokenizerType coreTokenizerType = CoreTokenizerType.SPLIT_CHARACTERS; 050 051 @Override 052 public Tokenizer getTokenizer() { 053 Tokenizer tokenizer; 054 logger.info("Using " + coreTokenizerType); 055 switch (coreTokenizerType) { 056 case BREAK_ITERATOR: 057 tokenizer = breakIteratorOptions.getTokenizer(); 058 break; 059 case SPLIT_CHARACTERS: 060 tokenizer = splitCharactersTokenizerOptions.getTokenizer(); 061 break; 062 case NON: 063 tokenizer = new NonTokenizer(); 064 break; 065 case SHAPE: 066 tokenizer = new ShapeTokenizer(); 067 break; 068 case SPLIT_PATTERN: 069 tokenizer = splitPatternTokenizerOptions.getTokenizer(); 070 break; 071 case UNIVERSAL: 072 tokenizer = new UniversalTokenizer(); 073 break; 074 default: 075 throw new IllegalArgumentException("Unknown tokenizer " + coreTokenizerType); 076 } 077 return tokenizer; 078 } 079 080 /** 081 * Tokenizer type. 082 */ 083 public enum CoreTokenizerType { 084 /** 085 * Creates a {@link org.tribuo.util.tokens.impl.BreakIteratorTokenizer}. 086 */ 087 BREAK_ITERATOR, 088 /** 089 * Creates a {@link org.tribuo.util.tokens.impl.SplitCharactersTokenizer}. 090 */ 091 SPLIT_CHARACTERS, 092 /** 093 * Creates a {@link NonTokenizer}. 094 */ 095 NON, 096 /** 097 * Creates a {@link ShapeTokenizer}. 098 */ 099 SHAPE, 100 /** 101 * Creates a {@link org.tribuo.util.tokens.impl.SplitPatternTokenizer}. 102 */ 103 SPLIT_PATTERN, 104 /** 105 * Creates a {@link UniversalTokenizer}. 106 */ 107 UNIVERSAL 108 } 109 110}