001/*
002 * Copyright c 2018 Rusi Popov, MDA Tools.net All rights reserved.
003 *
004 * This program and the accompanying materials are made available under the terms of the
005 * Eclipse Public License v2.0 which accompanies this distribution, and is available at
006 * http://www.eclipse.org/legal/epl-v20.html
007 */
008package net.mdatools.modelant.core.util;
009
010import java.util.ArrayList;
011import java.util.Iterator;
012import java.util.List;
013
/**
 * A parser of identifiers that splits them into the sequence of words they are composed of. Words
 * are terminated by non-alphanumeric characters or by a change of the letters' case, which makes
 * the class useful for splitting, for example, Java identifiers: its own name
 * {@code NameTokenizer} is parsed into the 2 words "Name" and "Tokenizer".
 * <p>
 * Words consist of letters and digits only. Any non-alphanumeric characters are skipped and are
 * used only to terminate words. No empty words are identified. Rules to identify words:
 * <ul>
 * <li>Words can consist all of lower case letters
 * <li>Words can start with 1 upper case letter and continue in lower case
 * <li>Words starting with more than 1 upper case letter are split into two words:
 * <ul>
 * <li>a word all in upper case
 * <li>a next word starting with exactly one letter in upper case and all other in lower case (at
 * least one exists)
 * </ul>
 * <li>Words that contain digits are terminated before the first non-digit character after the
 * digits
 * <li>Any leading digits (before the first word) are skipped; after the first word, a run of
 * digits that follows a separator forms a word of its own
 * <li>Letters that are not upper case (including letters of scripts without case) are handled as
 * lower case letters, so they never get dropped from the words
 * </ul>
 * The source is parsed completely at construction time, this iterator just walks through the
 * words found.
 *
 * @author rpopov
 */
public class NameTokenizer implements Iterator<String> {

  /**
   * Internal states of the name parser.
   */
  private enum State {
    /** Only separators (if any) have been parsed since the last completed word */
    SEPARATOR,

    /** The current word started with an upper case letter and has no lower case letters or digits yet */
    WORD_WITH_FIRST_UPPER,

    /** The current word is continuing with letters that are not in upper case */
    WORD_IN_LOWER_CASE,

    /** The current word has reached its digits, so any non-digit terminates it */
    WORD_WITH_DIGITS
  }

  /**
   * Iterates the identified words in the order they occur in the source
   */
  private final Iterator<String> wordsIterator;


  /**
   * This is the only constructor of the name tokenizer - it parses the source string/identifier
   * immediately and prepares the words found for iteration.
   *
   * @param source is the non-null identifier or general text to be parsed
   * @throws NullPointerException when source is null
   */
  public NameTokenizer(String source) {
    this.wordsIterator = parse( source ).iterator();
  }


  /**
   * Parses the source identifier/text char-by-char and returns the list of the words identified,
   * following the rules described for this class. Non-alphanumeric characters (spaces, _, $, \t,
   * \n, \r, \f etc.) only separate the words and are never part of them.
   *
   * @param source is the non-null identifier/text to split into words
   * @return a non-null, possibly empty list of the non-empty words identified, in the order they
   *         occur in source
   */
  private static List<String> parse(String source) {
    final List<String> result = new ArrayList<>();
    final int len = source.length();
    final StringBuilder word = new StringBuilder( len );
    State state = State.SEPARATOR;

    for ( int i = 0; i < len; i++ ) {
      final char current = source.charAt( i );

      switch ( state ) {
        case SEPARATOR: // only separators have been parsed (if any)
          if ( Character.isLetter( current ) ) {
            word.append( current );
            state = stateForFirstLetter( current );

          } else if ( Character.isDigit( current ) && !result.isEmpty() ) {
            // a word may start with a digit only if it is not the first word in the identifier
            word.append( current );
            state = State.WORD_WITH_DIGITS;
          } // anything else is a separator (or a leading digit) and is skipped
          break;

        case WORD_WITH_FIRST_UPPER: // the word is (by now) all in upper case
          if ( !Character.isLetterOrDigit( current ) ) { // a terminator found
            completeWord( result, word );
            state = State.SEPARATOR;

          } else if ( Character.isLowerCase( current ) ) { // the word continues in lower case
            word.append( current );
            state = State.WORD_IN_LOWER_CASE;

          } else if ( Character.isDigit( current ) ) { // the word continues with digits
            word.append( current );
            state = State.WORD_WITH_DIGITS;

          } else if ( i + 1 < len && Character.isLowerCase( source.charAt( i + 1 ) ) ) {
            // an upper case letter followed by a lower case one belongs to the NEXT word, like
            // the P in XMLParser: complete the all-capitals word and start a new one with it
            completeWord( result, word );
            word.append( current );

          } else { // one more capital letter of the same word
            word.append( current );
          }
          break;

        case WORD_IN_LOWER_CASE: // any upper case letter will be treated as a new word start
          if ( Character.isLowerCase( current )
               || Character.isLetter( current ) && !Character.isUpperCase( current ) ) {
            // the same word continues. Letters without case are handled as lower case ones, the
            // same way as when they start a word - otherwise they would be lost as "terminators"
            word.append( current );

          } else if ( Character.isDigit( current ) ) { // the word continues with digits
            word.append( current );
            state = State.WORD_WITH_DIGITS;

          } else { // this word terminates
            completeWord( result, word );

            if ( Character.isUpperCase( current ) ) { // a new word started
              word.append( current );
              state = State.WORD_WITH_FIRST_UPPER;

            } else { // a terminator found
              state = State.SEPARATOR;
            }
          }
          break;

        case WORD_WITH_DIGITS: // any non-digit will be treated as the end of the word
          if ( Character.isDigit( current ) ) { // the same word continues
            word.append( current );

          } else {
            completeWord( result, word );

            if ( Character.isLetter( current ) ) { // a new word begins
              word.append( current );
              state = stateForFirstLetter( current );

            } else { // non-alphanumeric characters are treated as separators
              state = State.SEPARATOR;
            }
          }
          break;

        default:
          throw new IllegalStateException( "Unexpected parser state: " + state );
      }
    }

    // store the last word parsed, as nothing terminated it
    if ( word.length() > 0 ) {
      result.add( word.toString() );
    }
    return result;
  }


  /**
   * Chooses the parser state to continue with, after the letter provided started a new word.
   *
   * @param firstLetter is the letter the new word starts with
   * @return the state for a word starting in upper case or not in upper case, respectively
   */
  private static State stateForFirstLetter(char firstLetter) {
    return Character.isUpperCase( firstLetter ) ? State.WORD_WITH_FIRST_UPPER
                                                : State.WORD_IN_LOWER_CASE;
  }


  /**
   * Stores the word collected so far as a next identified word and empties the buffer, so that a
   * new word could be collected in it.
   *
   * @param result is the list of words identified by now
   * @param word is the buffer holding the characters of the completed word
   */
  private static void completeWord(List<String> result, StringBuilder word) {
    result.add( word.toString() );
    word.setLength( 0 );
  }


  /**
   * This method removes the word just retrieved by next() from the words still held by this
   * tokenizer. Its usage in the context of this class seems to be obsolete. It is provided for
   * completeness.
   *
   * @see java.util.Iterator#remove()
   */
  @Override
  public void remove() {
    wordsIterator.remove();
  }


  /**
   * This method checks if there is a following word identified.
   *
   * @return true if one exists
   * @see java.util.Iterator#hasNext()
   */
  @Override
  public boolean hasNext() {
    return wordsIterator.hasNext();
  }


  /**
   * This method returns the next word identified if one exists.
   *
   * @return a non-empty string representing the next word identified
   * @throws java.util.NoSuchElementException when there are no more words
   * @see java.util.Iterator#next()
   */
  @Override
  public String next() {
    return wordsIterator.next();
  }
}