/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.util.tokens.impl;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
import org.tribuo.util.tokens.Token;
import org.tribuo.util.tokens.Tokenizer;

import java.text.BreakIterator;
import java.util.Locale;

/**
 * A tokenizer wrapping a {@link BreakIterator} instance.
 * <p>
 * The word-instance break iterator for the configured locale supplies the
 * boundaries; every span between two consecutive boundaries that is not empty
 * after {@link String#trim()} is reported as a token of type
 * {@link Token.TokenType#WORD}.
 * <p>
 * The tokenizer keeps its iteration state (current text, position and token)
 * in instance fields, so {@link #reset(CharSequence)} must be called before
 * {@link #advance()}, and the getters are only valid after {@code advance()}
 * has returned {@code true}.
 */
public class BreakIteratorTokenizer implements Tokenizer {

    @Config(mandatory = true, description="The locale language tag string.")
    private String localeStr;

    private Locale locale;

    private BreakIterator breakIterator;

    // The text currently being tokenized; null until reset(CharSequence) is called.
    private CharSequence cs;

    // Start index of the next span to be examined by advance().
    private int start;

    // Offsets and text of the current token; only meaningful while ready is true.
    private int startOffset;
    private int endOffset;

    private String token;

    // True only while a current token is available to the getters.
    private boolean ready;

    /**
     * Default constructor for configuration system.
     */
    @SuppressWarnings("unused")
    private BreakIteratorTokenizer() {}

    /**
     * Constructs a BreakIteratorTokenizer using the specified locale.
     * @param locale The locale to use.
     */
    public BreakIteratorTokenizer(Locale locale) {
        this.locale = locale;
        this.localeStr = locale.toLanguageTag();
        breakIterator = BreakIterator.getWordInstance(locale);
        ready = false;
        cs = null;
    }

    /**
     * Used by the OLCUT configuration system, and should not be called by external code.
     * <p>
     * Rebuilds the locale and the break iterator from the language tag and
     * returns the tokenizer to its un-reset state.
     */
    @Override
    public void postConfig() {
        locale = Locale.forLanguageTag(localeStr);
        breakIterator = BreakIterator.getWordInstance(locale);
        ready = false;
        cs = null;
    }

    /**
     * Returns the locale string this tokenizer uses.
     * @return The locale string.
     */
    public String getLanguageTag() {
        return localeStr;
    }

    @Override
    public ConfiguredObjectProvenance getProvenance() {
        return new ConfiguredObjectProvenanceImpl(this, "Tokenizer");
    }

    /**
     * Starts tokenizing a new character sequence.
     * <p>
     * The string form of the sequence is handed to the break iterator, the
     * position is moved to the first boundary and any current token is discarded.
     * @param cs The text to tokenize.
     */
    @Override
    public void reset(CharSequence cs) {
        this.cs = cs;
        breakIterator.setText(cs.toString());
        start = breakIterator.first();
        clearCurrentToken();
    }

    /**
     * Moves to the next token, skipping spans that are empty after {@link String#trim()}.
     * <p>
     * When no further token exists the current token is cleared, so the getters
     * throw {@link IllegalStateException} rather than exposing a stale token or
     * a skipped whitespace span.
     * @return True if a token was found, false if the text is exhausted.
     * @throws IllegalStateException if {@link #reset(CharSequence)} has not been called.
     */
    @Override
    public boolean advance() {
        if (cs == null) {
            throw new IllegalStateException("BreakIteratorTokenizer has not been reset.");
        }
        for (int end = breakIterator.next(); end != BreakIterator.DONE; end = breakIterator.next()) {
            String candidate = cs.subSequence(start, end).toString();
            int candidateStart = start;
            // Always step past this span, whether or not it is accepted as a token.
            start = end;
            if (!candidate.trim().isEmpty()) {
                // Commit the token state only for spans that are actually reported.
                token = candidate;
                startOffset = candidateStart;
                endOffset = end;
                ready = true;
                return true;
            }
        }

        // Exhausted: there is no current token any more.
        clearCurrentToken();
        return false;
    }

    /**
     * Gets the text of the current token.
     * @return The token text.
     * @throws IllegalStateException if there is no current token.
     */
    @Override
    public String getText() {
        checkReady();
        return token;
    }

    /**
     * Gets the start offset of the current token within the text passed to reset.
     * @return The inclusive start offset.
     * @throws IllegalStateException if there is no current token.
     */
    @Override
    public int getStart() {
        checkReady();
        return startOffset;
    }

    /**
     * Gets the end offset of the current token within the text passed to reset.
     * @return The exclusive end offset.
     * @throws IllegalStateException if there is no current token.
     */
    @Override
    public int getEnd() {
        checkReady();
        return endOffset;
    }

    /**
     * Gets the type of the current token, which is always {@link Token.TokenType#WORD}.
     * @return The token type.
     * @throws IllegalStateException if there is no current token.
     */
    @Override
    public Token.TokenType getType() {
        checkReady();
        return Token.TokenType.WORD;
    }

    /**
     * Copies this tokenizer via {@code super.clone()} and then re-runs
     * {@link #postConfig()} on the copy, so the copy has its own break iterator
     * and must be reset before use.
     * @return A fresh tokenizer using the same language tag.
     */
    @Override
    public BreakIteratorTokenizer clone() {
        try {
            BreakIteratorTokenizer copy = (BreakIteratorTokenizer) super.clone();
            copy.postConfig();
            return copy;
        } catch (CloneNotSupportedException e) {
            // Keep the original exception as the cause so the failure is diagnosable.
            throw new AssertionError("BreakIteratorTokenizer is Cloneable, but clone call failed", e);
        }
    }

    /**
     * Throws if there is no current token for the getters to report.
     */
    private void checkReady() {
        if (!ready) {
            throw new IllegalStateException("BreakIteratorTokenizer is not ready.");
        }
    }

    /**
     * Discards the current token and marks the tokenizer as not ready.
     */
    private void clearCurrentToken() {
        startOffset = -1;
        endOffset = -1;
        token = null;
        ready = false;
    }
}