/**
 * Splits an identifier (or general text) into the sequence of words it is composed of and
 * iterates over those words.
 * <p>
 * Words are terminated by non-alphanumeric characters or by a change of the letters' case, which
 * makes the class suitable for splitting, for example, Java identifiers: its own name
 * {@code NameTokenizer} is split into the two words {@code "Name"} and {@code "Tokenizer"}.
 * Non-alphanumeric characters are skipped and serve only as word terminators. No empty words are
 * produced.
 * <p>
 * Rules to identify words:
 * <ul>
 * <li>Words can consist entirely of lower case letters
 * <li>Words can start with 1 upper case letter and continue in lower case
 * <li>Words starting with more than 1 upper case letter are split into two words:
 *   <ul>
 *   <li>a word all in upper case
 *   <li>a next word starting with exactly one upper case letter followed by lower case letters
 *       (at least one exists)
 *   </ul>
 * <li>Words that contain digits are terminated before the first non-digit character after the
 *     digits
 * <li>Any leading digits (before the first word) are skipped
 * <li>Letters that are neither upper nor lower case (for example ideographs) start and continue
 *     a word the same way lower case letters do
 * </ul>
 *
 * @author rpopov
 */
public class NameTokenizer implements Iterator<String> {

  /**
   * Internal states of the name parser.
   */
  private enum State {
    /** Only separators (if anything) have been consumed since the last completed word. */
    SEPARATOR,

    /** The current word started with an upper case letter and has no lower case letter yet. */
    WORD_WITH_FIRST_UPPER,

    /** The current word is continuing with lower case (or uncased) letters. */
    WORD_IN_LOWER_CASE,

    /** The current word is continuing with digits. */
    WORD_WITH_DIGITS
  }

  /**
   * Iterates over the identified words in the order they occur in the source.
   */
  private final Iterator<String> wordsIterator;


  /**
   * Parses the source string/identifier eagerly and prepares the iteration over its words.
   *
   * @param source is the identifier or general text to be parsed, not null
   * @throws NullPointerException when source is null
   */
  public NameTokenizer(String source) {
    this.wordsIterator = parse( source ).iterator();
  }


  /**
   * Parses the source identifier/text character by character and returns the words identified,
   * following the rules listed in the class description. Any character that is not a letter or
   * a digit acts as a separator and is not included in any word.
   *
   * @param source is the identifier/text to split into words, not null
   * @return a non-null, possibly empty, list of the words identified in their order of occurrence
   */
  private static List<String> parse(String source) {
    final List<String> result = new ArrayList<>();
    final int len = source.length();
    final StringBuilder word = new StringBuilder( len );
    State state = State.SEPARATOR;

    for ( int i = 0; i < len; i++ ) {
      final char current = source.charAt( i );

      switch ( state ) {
        case SEPARATOR: // only separators have been parsed (if any)
          if ( Character.isLetter( current ) ) {
            word.append( current );
            state = stateOfWordStartingWith( current );

          } else if ( Character.isDigit( current ) && !result.isEmpty() ) {
            // digits may start a word, except before the very first word of the source
            word.append( current );
            state = State.WORD_WITH_DIGITS;
          } // any other character is a separator and is skipped
          break;

        case WORD_WITH_FIRST_UPPER: // the word holds only upper case letters so far
          if ( !Character.isLetterOrDigit( current ) ) { // a terminator
            completeWord( result, word );
            state = State.SEPARATOR;

          } else if ( Character.isLowerCase( current ) ) {
            word.append( current );
            state = State.WORD_IN_LOWER_CASE;

          } else if ( Character.isDigit( current ) ) {
            word.append( current );
            state = State.WORD_WITH_DIGITS;

          } else { // one more non-lower-case letter
            if ( i + 1 < len && Character.isLowerCase( source.charAt( i + 1 ) ) ) {
              // the next character is in lower case, so this letter is the capital that starts
              // the next word: the all-capitals word collected so far is complete
              completeWord( result, word );
            }
            word.append( current );
          }
          break;

        case WORD_IN_LOWER_CASE: // any upper case letter is treated as a new word start
          if ( Character.isLowerCase( current ) || isUncasedLetter( current ) ) {
            // Uncased letters must continue the word: treating them as terminators would
            // silently drop a letter from the output.
            word.append( current );

          } else if ( Character.isDigit( current ) ) {
            word.append( current );
            state = State.WORD_WITH_DIGITS;

          } else { // this word terminates
            completeWord( result, word );

            if ( Character.isUpperCase( current ) ) { // a new word started
              word.append( current );
              state = State.WORD_WITH_FIRST_UPPER;
            } else { // a terminator found
              state = State.SEPARATOR;
            }
          }
          break;

        case WORD_WITH_DIGITS: // any non-digit character terminates the word
          if ( Character.isDigit( current ) ) {
            word.append( current );

          } else {
            completeWord( result, word );

            if ( Character.isLetter( current ) ) { // a new word begins
              word.append( current );
              state = stateOfWordStartingWith( current );
            } else { // non-alphanumeric characters are treated as separators
              state = State.SEPARATOR;
            }
          }
          break;
      }
    }

    // store the last word parsed, if the source did not end with a separator
    if ( word.length() > 0 ) {
      result.add( word.toString() );
    }
    return result;
  }


  /**
   * Chooses the parser state for a word whose first character is the letter provided.
   *
   * @param letter is the first letter of the new word
   * @return WORD_WITH_FIRST_UPPER for an upper case letter, WORD_IN_LOWER_CASE otherwise
   */
  private static State stateOfWordStartingWith(char letter) {
    return Character.isUpperCase( letter ) ? State.WORD_WITH_FIRST_UPPER
                                           : State.WORD_IN_LOWER_CASE;
  }


  /**
   * Checks for a letter that continues a lower case word without being reported as lower case.
   *
   * @param c is the character to check
   * @return true when c is a letter that is not in upper case
   */
  private static boolean isUncasedLetter(char c) {
    return Character.isLetter( c ) && !Character.isUpperCase( c );
  }


  /**
   * Stores the word collected so far in the result and empties the buffer for the next word.
   *
   * @param result is the list to add the collected word to
   * @param word is the buffer holding the collected word; it is emptied
   */
  private static void completeWord(List<String> result, StringBuilder word) {
    result.add( word.toString() );
    word.setLength( 0 );
  }


  /**
   * Removes the word just retrieved by next() from the internal list of words. It is provided
   * for completeness of the Iterator implementation.
   *
   * @see java.util.Iterator#remove()
   */
  @Override
  public void remove() {
    wordsIterator.remove();
  }


  /**
   * Checks if there is a following word identified.
   *
   * @return true if one exists
   * @see java.util.Iterator#hasNext()
   */
  @Override
  public boolean hasNext() {
    return wordsIterator.hasNext();
  }


  /**
   * Returns the next word identified, if one exists.
   *
   * @return a string representing the next word identified
   * @see java.util.Iterator#next()
   */
  @Override
  public String next() {
    return wordsIterator.next();
  }
}