001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.util.tokens.universal;
018
019import com.oracle.labs.mlrg.olcut.config.Config;
020import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
021import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
022import org.tribuo.util.tokens.Token;
023import org.tribuo.util.tokens.Tokenizer;
024
025import java.util.Arrays;
026import java.util.LinkedList;
027import java.util.Queue;
028
029/**
030 * This class was originally written for the purpose of document indexing in an
031 * information retrieval context (principally used in Sun Labs' Minion search
032 * engine). It was refactored here to implement the Tokenizer interface taking
033 * care that the the 'ngram' tokens had correct character offsets. This is
034 * typically not required in the document indexing context - but is essential
035 * in other kinds of text processing / NLP tasks.
036 * <p>
037 * This tokenizer has some specific behavior in how it handles "ngram"
038 * characters - i.e., those characters for which {@link #isNgram(char)} returns
039 * true (CJK characters and others). For these characters, it will generate
040 * tokens corresponding to character bigrams in addition to tokens corresponding
041 * to token unigrams. Most of the other tokenizers will generate tokens that
042 * have no overlapping spans but here the character bigram tokens will overlap
043 * with the character unigram tokens.
044 * <p>
045 * This tokenizer uses bigram tokenization whenever it encounters 'ngram'
046 * characters in the CJK range (among others see {@link #isNgram(char)}). It
047 * otherwise tokenizes using punctuation and whitespace separators to separate
048 * words. Within runs of 'ngram' characters the tokenizer will generate tokens
049 * corresponding to two adjacent characters in addition to tokens corresponding
050 * to each character. The tokens corresponding to character bigrams may overlap
051 * with the previous and next token. An end-of-line between two 'ngram'
052 * characters is ignored (i.e., a character bigram token will be created.)
053 * <p>
054 * For example, a sequence of three Chinese characters, 非常感, would tokenize as
055 * three WORD type tokens: 非, 常, and 感 and two NGRAM type tokens: 非常 and 常感.
056 * Here these tokens will have character offsets that correspond to the
057 * character offsets into the text. Here are the tokens listed with their
058 * character offsets:
059 * <ul>
060 * <li>非[0,1]</li>
061 * <li>非常[0,2]</li>
062 * <li>常[1,2]</li>
063 * <li>常感[1,3]</li>
064 * <li>感[2,3]</li>
065 * </ul>
066 */
public class UniversalTokenizer implements Tokenizer {

    /**
     * The length of the longest token that we will generate.
     */
    protected int maxTokenLength = 256;
    /**
     * Set to true once the whole character sequence has been consumed;
     * {@link #advance()} returns false thereafter (until {@link #reset(CharSequence)}).
     */
    private boolean eofReached = false;

    /**
     * The character position in the character sequence that we're tokenizing.
     */
    private int pos;

    /**
     * The starting offset of the current buffer in the token stream.
     */
    private int start;

    /**
     * If <code>true</code> then unigrams will be generated for each n-gram
     * character.
     */
    private boolean generateUnigrams = true;

    /**
     * If <code>true</code> then character bigrams will be generated for each n-gram
     * character as defined by {@link #isNgram(char)}.
     */
    private boolean generateNgrams = true;
    /**
     * The state of the tokenizer determined by previous history.
     */
    private State state;
    /**
     * The character sequence that we're currently processing.
     */
    private CharSequence cs;
    /**
     * The token that we're building.
     */
    private char[] buffer;
    /**
     * A string representation of the current token.
     */
    private String currToken;
    /**
     * The current type of the token.
     */
    private Token.TokenType currType;
    /**
     * The current word position of the token.
     */
    private int currPos;
    /**
     * The starting offset of the current token.
     */
    private int startOffset;
    /**
     * The ending offset of the current token.
     */
    private int endOffset;
    /**
     * The length of the current token we're building.
     */
    private int tokenLength;
    /**
     * Whether this is the first token.
     */
    private boolean firstToken;
    /**
     * Is the tokenizer ready? True once a token has been produced by
     * {@link #advance()}; guards the getter methods below.
     */
    private boolean ready;
    @Config(description="Send punctuation through as tokens.")
    private boolean sendPunct = false;
    /**
     * A set of tokens that were generated and need to be returned.
     */
    private Queue<Range> queuedTokens;
    /**
     * Recycled Range objects, reused to avoid allocating one per token.
     */
    private Queue<Range> pool;
    /**
     * The current character that we're processing.
     */
    private char c;

    /**
     * Constructs a universal tokenizer.
     * @param sendPunct if sendPunct is true, then the tokenizer will generate punctuation tokens.
     */
    public UniversalTokenizer(boolean sendPunct) {
        super();
        this.sendPunct = sendPunct;
        this.buffer = new char[maxTokenLength];
        this.tokenLength = 0;
        this.state = State.SKIPPING;
        this.queuedTokens = new LinkedList<>();
        this.pool = new LinkedList<>();
    }

    /**
     * Constructs a universal tokenizer which doesn't send punctuation.
     */
    public UniversalTokenizer() {
        this(false);
    }

    /**
     * A quick check for whether a character should be kept in a word or should
     * be removed from the word if it occurs at one of the ends. An
     * approximation of Character.isLetterOrDigit, but is faster and more
     * correct, since it doesn't count the smart quotes as letters.
     *
     * @param c The character to check.
     * @return True if the input is a letter or digit.
     */
    public static boolean isLetterOrDigit(char c) {
        if ((c <= 122 && c >= 97)
                || // most frequent: lowercase a...z
                (c <= 90 && c >= 65)
                || // frequent: uppercase A...Z
                (c <= 57 && c >= 48) // frequent: numbers 0...9
        ) {
            return true;
        } else if ((c <= 96)
                || // includes whitespace
                // NOTE(review): 210/211 are the MacRoman byte values for curly
                // double quotes, not the Unicode smart quotes (U+2018-U+201D);
                // in Latin-1 these are the letters Ò/Ó - confirm intent.
                (c == 210 || c == 211)
                || // (smart quotes)
                (c >= 123 && c <= 127) // {|}~DEL
        ) {
            return false;
        // NOTE(review): decimal 3021-3029 is U+0BCD-U+0BD5 (Tamil block), not
        // the Hangzhou-style numerals at U+3021-U+3029 (hex). Either the range
        // or the comment is wrong - confirm against upstream intent.
        } else if ((c >= 3021 && c <= 3029)
                || // Hangzhou-style numerals
                // NOTE(review): the two ranges below are unreachable - any char
                // in 65..90 or 48..57 already returned true in the first branch.
                (c >= 65 && c <= 90)
                || // frequent: uppercase A...Z
                (c >= 48 && c <= 57) // frequent: numbers 0...9
        ) {
            return true;
        } else {
            // Fall back to the full (slower) Unicode classification.
            return Character.isLetterOrDigit(c);
        }
    }

    /**
     * A quick check for whether a character is a digit.
     *
     * @param c The character to check
     * @return True if the input is a digit.
     */
    public static boolean isDigit(char c) {
        if ((c <= 57 && c >= 48) // most frequent: ASCII numbers 0...9
        ) {
            return true;
        } else if (c <= 255) {
            // No other digits in the Latin-1 range.
            return false;
        } else {
            return Character.isDigit(c);
        }
    }

    /**
     * A quick check for whether a character is whitespace.
     *
     * @param c The character to check
     * @return True if the input is a whitespace character.
     */
    public static boolean isWhitespace(char c) {
        //test for white space
        if ((c == 32)
                || // Space
                (c <= 13 && c >= 9)
                || // Tab, Linefeed, PageUp, Page, Return
                (c <= 4 && c >= 1) // STX, SOT, ETX (Enter), EOT
        ) {
            return true;
        } else if (c <= 255) {
            // No other whitespace in the Latin-1 range.
            return false;
        } else {
            return Character.isWhitespace(c);
        }
    }

    /**
     * A quick check for a character in a language that may not separate words
     * with whitespace (includes Arabic, CJK, and Thai). Uses Unicode Standard
     * Version 2.0.
     *
     * @param c The character to check
     * @return True if the input character is in a region which is not whitespace separated.
     */
    public static boolean isNgram(char c) {
        // Test for characters that may not separate words with white
        // space and therefore require bigram treatment.
        // Uses Unicode Standard Version 2.0.
        if (c > '\u3002' && c <= '\uD7FF') {           // (CJK Characters)
            return (c < '\u3040' || c > '\u30FF');   // - Hiragana and Katakana
        } else if ((c >= '\u0600' && c <= '\u06FF') || // (Arabic)
                (c >= '\uF900' && c <= '\uFAFF') || // (CJK Compatibility Ideographs)
                (c >= '\u1100' && c <= '\u11FF') || // (Hangul Jamo)
                (c >= '\uFB50' && c <= '\uFE2F') || // (Arabic Presentation Forms-A)
                (c >= '\uFE30' && c <= '\uFE4F') || // (CJK Compatibility Forms)
                (c >= '\uFE70' && c <= '\uFEFF') || // (Arabic Presentation Forms-B)
                (c >= '\uFF60' && c <= '\uFFDF') || // (CJK Half Width Forms)
                (c >= '\u0E00' && c <= '\u0E7F') || // (Thai)
                (c >= '\u0E80' && c <= '\u0EFF') || // (Lao)
                (c >= '\u0F00' && c <= '\u0FBF') || // (Tibetan)
                (c >= '\u0B80' && c <= '\u0BFF') || // (Tamil)
                (c >= '\u0C00' && c <= '\u0C7F') || // (Telugu)
                (c >= '\u0C80' && c <= '\u0CFF') || // (Kannada)
                (c >= '\u0D00' && c <= '\u0D7F') || // (Malayalam)
                (c >= '\u10A0' && c <= '\u10FF')) { // (Georgian)
            return true;
        } else {
            return false;
        }
    }

    /**
     * Does this tokenizer generate unigrams?
     * @return True if the tokenizer generates unigram tokens.
     */
    public boolean isGenerateUnigrams() {
        return generateUnigrams;
    }

    /**
     * Controls if the tokenizer generates unigrams.
     * @param generateUnigrams If true generates unigram tokens.
     */
    public void setGenerateUnigrams(boolean generateUnigrams) {
        this.generateUnigrams = generateUnigrams;
    }

    /**
     * Does this tokenizer generate ngrams?
     * @return True if the tokenizer generates ngram tokens.
     */
    public boolean isGenerateNgrams() {
        return generateNgrams;
    }

    /**
     * Controls if the tokenizer generates ngrams.
     * @param generateNgrams If true generates ngram tokens.
     */
    public void setGenerateNgrams(boolean generateNgrams) {
        this.generateNgrams = generateNgrams;
    }

    /**
     * Returns the maximum token length this tokenizer will generate.
     * @return The maximum token length.
     */
    public int getMaxTokenLength() {
        return maxTokenLength;
    }

    /**
     * Sets the maximum token length this tokenizer will generate.
     * @param maxTokenLength The maximum token length.
     */
    public void setMaxTokenLength(int maxTokenLength) {
        this.maxTokenLength = maxTokenLength;
    }

    @Override
    public ConfiguredObjectProvenance getProvenance() {
        return new ConfiguredObjectProvenanceImpl(this, "Tokenizer");
    }

    /**
     * Advances to the next token, consuming input characters as needed.
     * Must be called after {@link #reset(CharSequence)}.
     *
     * @return True if a token is available via the getters, false at end of input.
     * @throws IllegalStateException if the tokenizer has not been reset.
     */
    @Override
    public final boolean advance() {
        if (cs == null) {
            throw new IllegalStateException("UniversalTokenizer has not been reset.");
        }
        //
        // Do we have tokens queued up to go?
        if (queuedTokens.size() > 0) {
            handleQueued();
            return true;
        }

        //
        // If we've already read the data, then we're done.
        if (eofReached) {
            return false;
        }

        //
        // Read characters until we have one or more tokens to send.
        while (pos < cs.length()) {
            c = cs.charAt(pos);
            handleChar();
            pos++;
            if (queuedTokens.size() > 0) {
                handleQueued();
                return true;
            }
        }

        // Input exhausted: flush any partially collected token.
        eofReached = true;
        makeTokens();
        if (queuedTokens.size() > 0) {
            handleQueued();
            return true;
        }
        return false;
    }

    /**
     * Dequeues the next pending token into the current-token fields
     * (text, offsets, type, position) and recycles its Range into the pool.
     */
    private void handleQueued() {
        ready = true;
        Range range = queuedTokens.poll();
        currToken = new String(range.buff, 0, range.len);
        startOffset = range.start;
        endOffset = range.end;
        // Force the very first emitted token to have a position increment of 1.
        if (firstToken && range.incr == 0) {
            range.incr = 1;
            firstToken = false;
        }
        currType = range.type;
        currPos = range.incr;
        // Return the Range to the pool for reuse by getRange().
        pool.offer(range);
    }

    /**
     * Handle a character to add to the token buffer.
     * Dispatches on the current character {@link #c} and the tokenizer
     * {@link #state}, emitting queued tokens at word boundaries.
     */
    protected void handleChar() {

        //
        // ASCII characters.
        if ((c >= 97 && c <= 122) || (c >= 65 && c <= 90)) {
            if (state == State.NGRAM) {
                // Leaving an ngram run: flush its tokens before collecting.
                makeTokens();
            }
            addChar();
            state = State.COLLECTING;
            return;
        }

        //
        // ASCII space. We need to treat other whitespace differently, depending
        // on whether we're ngram tokenizing.
        if (c == 32) {
            switch (state) {
                case COLLECTING:
                case NGRAM:
                    // The transition from collecting or n-gram to whitespace
                    // causes us to emit tokens.
                    makeTokens();
                    break;
                case SKIPPING:
                    break;
                default:
                    break;
            }
            sendPunct();
            state = State.SKIPPING;
            return;
        }

        if (isNgram(c)) {
            // CJK characters (Chinese, Japanese, Korean)
            // to be tokenized with bigram tokens.
            // (Put this test here so these languages will tokenize
            // more efficiently and it doesn't cost much for the non CJK
            // languages.)

            switch (state) {
                case SKIPPING:
                    state = State.NGRAM;
                    break;
                case COLLECTING:
                    // Word characters followed by an ngram char: emit the word first.
                    makeTokens();
                    state = State.NGRAM;
                    break;
                case NGRAM:
                    break;
                default:
                    break;
            }
            addChar();
            return;
        }

        if (c == 0 || (state == State.NGRAM && (c >= 10 && c <= 13))) {
            // While processing ngram character regions, Linefeed, PageUp, Page, Return
            // don't do anything, so just return.
            // (This is what allows a character bigram to span an end-of-line.)
            return;
        }

        if (isWhitespace(c)) {
            // The rest of the white space characters for break:
            switch (state) {
                case COLLECTING:
                case NGRAM:
                    // The transition from collecting to whitespace
                    // causes us to emit tokens.
                    makeTokens();
                    break;
                case SKIPPING:
                    break;
                default:
                    break;
            }
            sendPunct();
            state = State.SKIPPING;
            return;
        }

        if ((c >= 48 && c <= 57) || (c > 255 && Character.isDigit(c))) {

            //
            // The digits.
            switch (state) {
                case SKIPPING:
                    state = State.COLLECTING;
                    break;
                case NGRAM:
                    makeTokens();
                    state = State.COLLECTING;
                    break;
                case COLLECTING:
                    break;
                default:
                    break;
            }
            addChar();
            return;
        }

        //
        // Any other letter or digit.
        if (isLetterOrDigit(c)) {
            if (state == State.NGRAM) {
                makeTokens();
            }
            addChar();
            state = State.COLLECTING;
            return;
        }

        // Anything other than the above cases, we break.
        if (state != State.SKIPPING) {
            makeTokens();
        }
        sendPunct();
        state = State.SKIPPING;
    }

    /**
     * Queues the current character as a punctuation token, if punctuation
     * tokens are enabled and the character is not whitespace.
     */
    private void sendPunct() {
        if (sendPunct && !isWhitespace(c)) {
            Range r = getRange();
            r.punct(c, pos);
            queuedTokens.add(r);
        }
    }

    /**
     * Add a character to the buffer that we're building for a token.
     */
    protected void addChar() {

        //
        // First see if token buffer needs to be expanded.
        // Note: tokLen points to the next unused slot in token.
        if (buffer.length <= tokenLength) {
            buffer = Arrays.copyOf(buffer, tokenLength + 32);
        }

        if (tokenLength == 0) {
            // Remember where this token started in the input.
            start = pos;
        }
        buffer[tokenLength++] = c;

        // Emit oversized tokens rather than growing without bound.
        if (tokenLength >= maxTokenLength) {
            makeTokens();
        }
    }

    @Override
    public int getStart() {
        if (ready) {
            return startOffset;
        } else {
            throw new IllegalStateException("UniversalTokenizer is not ready.");
        }
    }

    @Override
    public int getEnd() {
        if (ready) {
            return endOffset;
        } else {
            throw new IllegalStateException("UniversalTokenizer is not ready.");
        }
    }

    @Override
    public String getText() {
        if (ready) {
            return currToken;
        } else {
            throw new IllegalStateException("UniversalTokenizer is not ready.");
        }
    }

    @Override
    public Token.TokenType getType() {
        if (ready) {
            return currType;
        } else {
            throw new IllegalStateException("UniversalTokenizer is not ready.");
        }
    }

    /**
     * Gets the current position in the input.
     * @return The current position.
     */
    public int getPos() {
        return currPos;
    }

    @Override
    public Tokenizer clone() {
        try {
            UniversalTokenizer copy = (UniversalTokenizer) super.clone();

            // Give the copy fresh mutable state so it is independent of this
            // tokenizer; it must be reset() before use.
            copy.buffer = new char[maxTokenLength];
            copy.tokenLength = 0;
            copy.state = State.SKIPPING;
            copy.pool = new LinkedList<>();
            copy.queuedTokens = new LinkedList<>();
            copy.currToken = null;
            copy.ready = false;
            copy.cs = null;

            return copy;
        } catch (CloneNotSupportedException e) {
            throw new AssertionError("UniversalTokenizer is Cloneable, but clone call failed");
        }
    }

    /**
     * Reset state of tokenizer to clean slate.
     */
    @Override
    public void reset(CharSequence cs) {
        this.cs = cs;
        pos = 0;
        tokenLength = 0;
        start = -1;
        state = State.SKIPPING;
        eofReached = false;
        firstToken = true;
        c = 0;
        startOffset = -1;
        endOffset = -1;
        currToken = null;
        ready = false;
    }

    /**
     * Gets a Range, reusing one from the pool when available to avoid
     * allocation.
     */
    private Range getRange() {
        if (pool.isEmpty()) {
            return new Range();
        }
        return pool.remove();
    }

    /**
     * Make one or more tokens from our current collected characters.
     * In NGRAM state this emits per-character unigrams and overlapping
     * character bigrams (subject to the generateUnigrams/generateNgrams
     * flags); otherwise it emits a single token for the whole buffer.
     */
    protected void makeTokens() {

        //
        // Don't generate empty tokens.
        if (tokenLength <= 0) {
            return;
        }

        if (state == State.NGRAM) {
            // if we only have one character, then just generate a single
            // token and be done.
            if (tokenLength == 1) {
                Range range = getRange();
                range.set(buffer[0], start);
                queuedTokens.add(range);
                tokenLength = 0;
                return;
            }

            for (int i = 0; i < tokenLength; i++) {
                if (generateUnigrams) {
                    // Generate a unigram for this character.
                    Range range = getRange();
                    range.set(buffer[i], start + i);
                    queuedTokens.add(range);
                }
                if (generateNgrams && i < tokenLength - 1) {
                    // Generate a bigram for this character.
                    Range range = getRange();
                    range.set(buffer[i], buffer[i + 1], start + i);
                    queuedTokens.add(range);
                }
            }
        } else {
            // Generate one token from the buffer.
            Range range = getRange();
            range.set(buffer, tokenLength, start);
            queuedTokens.add(range);
        }
        // The buffer has been fully consumed.
        tokenLength = 0;
    }

    /**
     * The tokenizer's internal state: skipping separators, collecting word
     * characters, or inside a run of ngram characters.
     */
    private enum State {
        SKIPPING,
        COLLECTING,
        NGRAM,
    }

}