001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.util.tokens.impl;
018
019import com.oracle.labs.mlrg.olcut.config.Config;
020import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
021import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
022import org.tribuo.util.tokens.Token;
023import org.tribuo.util.tokens.Tokenizer;
024
025import java.text.BreakIterator;
026import java.util.Locale;
027
028/**
029 * A tokenizer wrapping a {@link BreakIterator} instance.
030 */
031public class BreakIteratorTokenizer implements Tokenizer {
032
033    @Config(mandatory = true, description="The locale language tag string.")
034    private String localeStr;
035
036    private Locale locale;
037
038    private BreakIterator breakIterator;
039
040    private CharSequence cs;
041
042    private int start;
043
044    private int startOffset;
045    private int endOffset;
046
047    private String token;
048
049    private boolean ready;
050
051    /**
052     * Default constructor for configuration system.
053     */
054    @SuppressWarnings("unused")
055    private BreakIteratorTokenizer() {}
056
057    /**
058     * Constructs a BreakIteratorTokenizer using the specified locale.
059     * @param locale The locale to use.
060     */
061    public BreakIteratorTokenizer(Locale locale) {
062        this.locale = locale;
063        this.localeStr = locale.toLanguageTag();
064        breakIterator = BreakIterator.getWordInstance(locale);
065        ready = false;
066        cs = null;
067    }
068
069    /**
070     * Used by the OLCUT configuration system, and should not be called by external code.
071     */
072    @Override
073    public void postConfig() {
074        locale = Locale.forLanguageTag(localeStr);
075        breakIterator = BreakIterator.getWordInstance(locale);
076        ready = false;
077        cs = null;
078    }
079
080    /**
081     * Returns the locale string this tokenizer uses.
082     * @return The locale string.
083     */
084    public String getLanguageTag() {
085        return localeStr;
086    }
087
088    @Override
089    public ConfiguredObjectProvenance getProvenance() {
090        return new ConfiguredObjectProvenanceImpl(this, "Tokenizer");
091    }
092
093    @Override
094    public void reset(CharSequence cs) {
095        this.cs = cs;
096        breakIterator.setText(cs.toString());
097        start = breakIterator.first();
098        startOffset = -1;
099        endOffset = -1;
100        token = null;
101        ready = false;
102    }
103
104    @Override
105    public boolean advance() {
106        if (cs == null) {
107            throw new IllegalStateException("BreakIteratorTokenizer has not been reset.");
108        }
109        int end = breakIterator.next();
110        while (end != BreakIterator.DONE) {
111            token = cs.subSequence(start, end).toString();
112            startOffset = start;
113            endOffset = end;
114            start = end;
115            if (!token.trim().isEmpty()) {
116                ready = true;
117                return true;
118            } else {
119                end = breakIterator.next();
120            }
121        }
122
123        return false;
124    }
125
126    @Override
127    public String getText() {
128        if (ready) {
129            return token;
130        } else {
131            throw new IllegalStateException("BreakIteratorTokenizer is not ready.");
132        }
133    }
134
135    @Override
136    public int getStart() {
137        if (ready) {
138            return startOffset;
139        } else {
140            throw new IllegalStateException("BreakIteratorTokenizer is not ready.");
141        }
142    }
143
144    @Override
145    public int getEnd() {
146        if (ready) {
147            return endOffset;
148        } else {
149            throw new IllegalStateException("BreakIteratorTokenizer is not ready.");
150        }
151    }
152
153    @Override
154    public Token.TokenType getType() {
155        if (ready) {
156            return Token.TokenType.WORD;
157        } else {
158            throw new IllegalStateException("BreakIteratorTokenizer is not ready.");
159        }
160    }
161
162    @Override
163    public BreakIteratorTokenizer clone() {
164        try {
165            BreakIteratorTokenizer copy = (BreakIteratorTokenizer) super.clone();
166            copy.postConfig();
167            return copy;
168        } catch (CloneNotSupportedException e) {
169            throw new AssertionError("BreakIteratorTokenizer is Cloneable, but clone call failed");
170        }
171    }
172}
173