001/* 002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.tribuo.util.tokens.universal; 018 019import org.tribuo.util.tokens.Token; 020 021/** 022 * A range currently being segmented. 023 */ 024public final class Range implements CharSequence { 025 /** 026 * The character buffer. 027 */ 028 public char[] buff = new char[16]; 029 /** 030 * The token length. 031 */ 032 public int len; 033 /** 034 * The start index. 035 */ 036 public int start; 037 /** 038 * The end index. 039 */ 040 public int end; 041 /** 042 * The value to increment by. 043 */ 044 public int incr; 045 /** 046 * The current token type. 047 */ 048 public Token.TokenType type; 049 050 Range() {} 051 052 /** 053 * Sets the first two characters in the range, and the type to NGRAM. 054 * @param c1 The first character. 055 * @param c2 The second character. 056 * @param start The start value. 057 */ 058 public void set(char c1, char c2, int start) { 059 buff[0] = c1; 060 buff[1] = c2; 061 this.start = start; 062 this.end = start + 2; 063 this.len = 2; 064 this.incr = 0; 065 this.type = Token.TokenType.NGRAM; 066 } 067 068 /** 069 * Sets the first character in the range. 070 * @param c The first character. 071 * @param start The start value. 072 */ 073 public void set(char c, int start) { 074 buff[0] = c; 075 this.start = start; 076 this.end = start + 1; 077 this.len = 1; 078 this.incr = 1; 079 this.type = Token.TokenType.WORD; 080 } 081 082 /** 083 * Sets the character range. 084 * @param buff The characters. 085 * @param len The length of the character buffer. 086 * @param start The start index. 087 */ 088 public void set(char[] buff, int len, int start) { 089 if (this.buff.length < buff.length) { 090 this.buff = new char[buff.length + 1]; 091 } 092 System.arraycopy(buff, 0, this.buff, 0, len); 093 this.len = len; 094 this.start = start; 095 this.end = start + len; 096 this.incr = 1; 097 this.type = Token.TokenType.WORD; 098 } 099 100 /** 101 * Sets this range to represent a punctuation character. 102 * @param p The punctuation character. 103 * @param start The start index. 104 */ 105 public void punct(char p, int start) { 106 buff[0] = p; 107 this.len = 1; 108 this.start = Math.max(start, 0); 109 this.end = this.start + 1; 110 this.incr = 0; 111 this.type = Token.TokenType.PUNCTUATION; 112 } 113 114 /** 115 * Sets the token type. 116 * @param type The token type. 117 */ 118 public void setType(Token.TokenType type) { 119 this.type = type; 120 } 121 122 @Override 123 public int length() { 124 return len; 125 } 126 127 @Override 128 public char charAt(int index) { 129 if (index < len) { 130 return buff[index]; 131 } 132 throw new IndexOutOfBoundsException(String.format("index %d exceeds length %d", index, len)); 133 } 134 135 @Override 136 public CharSequence subSequence(int start, int end) { 137 Range r = new Range(); 138 System.arraycopy(buff, start, r.buff, 0, end - start); 139 r.start = 0; 140 r.len = end - start; 141 r.end = r.len; 142 return r; 143 } 144 145 @Override 146 public String toString() { 147 return new String(buff, 0, len) + " " + type + " " + start + " " + end; 148 } 149}