001 /*
002 * Java Genetic Algorithm Library (jenetics-7.1.0).
003 * Copyright (c) 2007-2022 Franz Wilhelmstötter
004 *
005 * Licensed under the Apache License, Version 2.0 (the "License");
006 * you may not use this file except in compliance with the License.
007 * You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 *
017 * Author:
018 * Franz Wilhelmstötter (franz.wilhelmstoetter@gmail.com)
019 */
020 package io.jenetics.ext.grammar;
021
022 import static java.lang.String.format;
023 import static java.util.Objects.requireNonNull;
024 import static java.util.stream.Collectors.groupingBy;
025 import static java.util.stream.Collectors.toCollection;
026
027 import java.util.ArrayList;
028 import java.util.HashMap;
029 import java.util.HashSet;
030 import java.util.LinkedHashMap;
031 import java.util.List;
032 import java.util.Map;
033 import java.util.Optional;
034 import java.util.Set;
035 import java.util.function.Function;
036 import java.util.stream.Collectors;
037 import java.util.stream.Stream;
038
039 /**
040 * Represents a <em>context-free</em> grammar
041 * (<a href="https://en.wikipedia.org/wiki/Context-free_grammar"><b>CFG</b></a>).
042 * <p>
043 * <b>Formal definition</b>
044 * <p>
045 * A context-free grammar {@code G} is defined by the 4-tuple
046 * {@code G = (N, T, R, S)}, where
047 * <ul>
048 * <li>{@code N} is a finite set; each element {@code n ∈ N} is called a
049 * non-terminal ({@link NonTerminal}) character or a variable. Each
050 * variable represents a different type of phrase or clause in the sentence.
051 * Variables are also sometimes called syntactic categories. Each variable
052 * defines a sub-language of the language defined by {@code G}.
053 * </li>
054 * <li>{@code T} is a finite set of terminals ({@link Terminal}) disjoint
055 * from {@code N}, which make up the actual content of the sentence. The set
056 * of terminals is the alphabet of the language defined by the grammar
057 * {@code G}.
058 * </li>
059 * <li>{@code R} is a finite relation in {@code N × (N ∪ T)∗}, where the
060 * asterisk represents the <a href="https://en.wikipedia.org/wiki/Kleene_star">
061 * Kleene star</a> operation. The members of {@code R} are called the
062 * (rewrite) rules ({@link Rule}) or productions of the grammar.
063 * </li>
064 * <li>{@code S} is the start variable (or start symbol), used to represent
065 * the whole sentence (or program). It must be an element of {@code N}
066 * ({@link NonTerminal})
067 * .</li>
068 * </ul>
069 *
070 * You can easily create a <em>Cfg</em> object from a given BNF grammar.
071 * <pre>{@code
072 * final Cfg<String> grammar = Bnf.parse("""
073 * <expr> ::= <num> | <var> | '(' <expr> <op> <expr> ')'
074 * <op> ::= + | - | * | /
075 * <var> ::= x | y
076 * <num> ::= 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
077 * """
078 * );
079 * }</pre>
080 *
081 * It is also possible to create the grammar above programmatically.
082 * <pre>{@code
083 * final Cfg<String> grammar = Cfg.of(
084 * R("expr",
085 * E(NT("num")),
086 * E(NT("var")),
087 * E(T("("), NT("expr"), NT("op"), NT("expr"), T(")"))
088 * ),
089 * R("op", E(T("+")), E(T("-")), E(T("*")), E(T("/"))),
090 * R("var", E(T("x")), E(T("y"))),
091 * R("num",
092 * E(T("0")), E(T("1")), E(T("2")), E(T("3")),
093 * E(T("4")), E(T("5")), E(T("6")), E(T("7")),
094 * E(T("8")), E(T("9"))
095 * )
096 * );
097 * }</pre>
098 *
099 * @see Bnf#parse(String)
100 *
101 * @param <T> the terminal symbol value type
102 *
103 * @author <a href="mailto:franz.wilhelmstoetter@gmail.com">Franz Wilhelmstötter</a>
104 * @since 7.1
105 * @version 7.1
106 */
107 public record Cfg<T>(
108 List<NonTerminal<T>> nonTerminals,
109 List<Terminal<T>> terminals,
110 List<Rule<T>> rules,
111 NonTerminal<T> start
112 ) {
113
114 /**
115 * Represents the <em>symbols</em> the BNF grammar consists.
116 *
117 * @param <T> the terminal symbol value type
118 */
119 public sealed interface Symbol<T> {
120
121 /**
122 * Return the name of the symbol.
123 *
124 * @return the name of the symbol
125 */
126 String name();
127
128 }
129
130 /**
131 * Represents the non-terminal symbols of the grammar ({@code NT}).
132 *
133 * @param <T> the terminal symbol value type
134 */
135 public record NonTerminal<T>(String name) implements Symbol<T> {
136
137 /**
138 * @param name the name of the non-terminal symbol
139 * @throws IllegalArgumentException if the given {@code name} is not
140 * a valid <em>non-terminal</em> name
141 * @throws NullPointerException if one of the arguments is {@code null}
142 */
143 public NonTerminal {
144 if (name.isEmpty()) {
145 throw new IllegalArgumentException(
146 "Non-terminal value must not be empty."
147 );
148 }
149 }
150 }
151
152 /**
153 * Represents a terminal symbols of the grammar ({@code T}).
154 *
155 * @param <T> the terminal symbol value type
156 */
157 public record Terminal<T>(String name, T value) implements Symbol<T> {
158
159 /**
160 * @param name the name of the terminal symbol
161 * @param value the value of the terminal symbol
162 * @throws IllegalArgumentException if the given terminal {@code name}
163 * is empty
164 */
165 public Terminal {
166 if (name.isEmpty()) {
167 throw new IllegalArgumentException(
168 "Terminal value must not be empty."
169 );
170 }
171 }
172
173 /**
174 * Return a new terminal symbol where the name of the symbol is equal
175 * to its value.
176 *
177 * @param name the name (and value) of the terminal symbol
178 * @return a new terminal symbol with the given {@code name}
179 * @throws IllegalArgumentException if the given terminal {@code name}
180 * is empty
181 */
182 public static Terminal<String> of(final String name) {
183 return new Terminal<>(name, name);
184 }
185
186 }
187
188 /**
189 * Represents one <em>expression</em> (list of alternative symbols) a
190 * production rule consists of.
191 *
192 * @param <T> the terminal symbol value type
193 */
194 public record Expression<T>(List<Symbol<T>> symbols) {
195
196 /**
197 * @param symbols the list of symbols of the expression
198 * @throws IllegalArgumentException if the list of {@code symbols} is
199 * empty
200 */
201 public Expression {
202 if (symbols.isEmpty()) {
203 throw new IllegalArgumentException(
204 "The list of symbols must not be empty."
205 );
206 }
207 symbols = List.copyOf(symbols);
208 }
209
210 }
211
212 /**
213 * Represents a production rule of the grammar ({@code R}).
214 *
215 * @param <T> the terminal symbol value type
216 */
217 public record Rule<T>(NonTerminal<T> start, List<Expression<T>> alternatives) {
218
219 /**
220 * Creates a new rule object.
221 *
222 * @param start the start symbol of the rule
223 * @param alternatives the list af alternative rule expressions
224 * @throws IllegalArgumentException if the given list of
225 * {@code alternatives} is empty
226 * @throws NullPointerException if one of the arguments is {@code null}
227 */
228 public Rule {
229 requireNonNull(start);
230 if (alternatives.isEmpty()) {
231 throw new IllegalArgumentException(
232 "Rule alternatives must not be empty."
233 );
234 }
235 alternatives = List.copyOf(alternatives);
236 }
237
238 }
239
240 /**
241 * Create a new <em>context-free</em> grammar object.
242 *
243 * @param nonTerminals the non-terminal symbols of {@code this} grammar
244 * @param terminals the terminal symbols of {@code this} grammar
245 * @param rules the <em>production</em> rules of {@code this} grammar
246 * @param start the start symbol of {@code this} grammar
247 * @throws NullPointerException if one of the arguments is {@code null}
248 * @throws IllegalArgumentException if a rule is defined more than once, the
249 * start symbol points to a missing rule or the rules uses symbols
250 * not defined in the list of {@link #nonTerminals()} or
251 * {@link #terminals()}
252 */
253 public Cfg {
254 if (rules.isEmpty()) {
255 throw new IllegalArgumentException(
256 "The given list of rules must not be empty."
257 );
258 }
259
260 // Check uniqueness of the rules.
261 final var duplicatedRules = rules.stream()
262 .collect(Collectors.groupingBy(Rule::start))
263 .values().stream()
264 .filter(values -> values.size() > 1)
265 .map(rule -> rule.get(0).start.name)
266 .toList();
267
268 if (!duplicatedRules.isEmpty()) {
269 throw new IllegalArgumentException(
270 "Found duplicate rule(s), " + duplicatedRules + "."
271 );
272 }
273
274 // Check if start symbol points to an existing rule.
275 final var startRule = rules.stream()
276 .filter(r -> start.equals(r.start))
277 .findFirst();
278 if (startRule.isEmpty()) {
279 throw new IllegalArgumentException(
280 "No rule found for start symbol %s.".formatted(start)
281 );
282 }
283
284 // Check that all symbols used in the given rules are also defined
285 // in the list of non-terminals and terminals.
286 final Set<Symbol<T>> required = rules.stream()
287 .flatMap(Cfg::ruleSymbols)
288 .collect(Collectors.toUnmodifiableSet());
289
290 final Set<Symbol<T>> available = Stream
291 .concat(nonTerminals.stream(), terminals.stream())
292 .collect(Collectors.toUnmodifiableSet());
293
294 final var missing = new HashSet<>(required);
295 missing.removeAll(available);
296
297 if (!missing.isEmpty()) {
298 throw new IllegalArgumentException(
299 "Unknown symbols defined in rules: " + missing
300 );
301 }
302
303 // Check if the name of terminals and non-terminals are distinct.
304 final var terminalNames = terminals.stream()
305 .map(Symbol::name)
306 .collect(Collectors.toSet());
307
308 final var nonTerminalNames = nonTerminals.stream()
309 .map(Symbol::name)
310 .collect(Collectors.toSet());
311
312 terminalNames.retainAll(nonTerminalNames);
313 if (!terminalNames.isEmpty()) {
314 throw new IllegalArgumentException(format(
315 "Terminal and non-terminal symbols with same name: %s",
316 terminalNames.stream().sorted().toList()
317 ));
318 }
319
320 nonTerminals = List.copyOf(nonTerminals);
321 terminals = List.copyOf(terminals);
322 rules = List.copyOf(rules);
323 requireNonNull(start);
324 }
325
326 /**
327 * Return the rule for the given {@code start} symbol.
328 *
329 * @param start the start symbol of the rule
330 * @return the rule for the given {@code start} symbol
331 * @throws NullPointerException if the given {@code start} symbol is
332 * {@code null}
333 */
334 public Optional<Rule<T>> rule(final NonTerminal<?> start) {
335 requireNonNull(start);
336 for (var rule : rules) {
337 if (rule.start().name().equals(start.name())) {
338 return Optional.of(rule);
339 }
340 }
341 return Optional.empty();
342 }
343
344 /**
345 * Maps the values of the terminal symbols from type {@code T} to type
346 * {@code A}.
347 *
348 * @param mapper the mapper function
349 * @param <A> the new value type of the terminal symbols
350 * @return the mapped grammar
351 * @throws NullPointerException if the given mapper is {@code null}
352 */
353 public <A> Cfg<A> map(final Function<? super Terminal<T>, ? extends A> mapper) {
354 requireNonNull(mapper);
355
356 final var cache = new HashMap<Terminal<T>, Terminal<A>>();
357 final Function<Terminal<T>, Terminal<A>> mapping = t -> cache
358 .computeIfAbsent(t, t2 -> new Terminal<>(t2.name(), mapper.apply(t2)));
359
360 @SuppressWarnings("unchecked")
361 final List<Rule<A>> rules = rules().stream()
362 .map(rule -> new Rule<>(
363 (NonTerminal<A>)rule.start(),
364 rule.alternatives().stream()
365 .map(expr -> new Expression<>(
366 expr.symbols().stream()
367 .map(sym -> sym instanceof Cfg.Terminal<T> t
368 ? mapping.apply(t) : (Symbol<A>)sym)
369 .toList()
370 ))
371 .toList()
372 ))
373 .toList();
374
375 return Cfg.of(rules);
376 }
377
378 /**
379 * Create a grammar object with the given rules. Duplicated rules are merged
380 * into one rule. The <em>start</em> symbol of the first rule is chosen as
381 * the start symbol of the created CFG
382 *
383 * @param rules the rules the grammar consists of
384 * @throws IllegalArgumentException if the list of rules is empty
385 * @throws NullPointerException if the list of rules is {@code null}
386 */
387 public static <T> Cfg<T> of(final List<Rule<T>> rules) {
388 if (rules.isEmpty()) {
389 throw new IllegalArgumentException(
390 "The list of rules must not be empty."
391 );
392 }
393
394 final List<Rule<T>> normalizedRules = normalize(rules);
395
396 final List<Symbol<T>> symbols = normalizedRules.stream()
397 .flatMap(Cfg::ruleSymbols)
398 .distinct()
399 .toList();
400
401 final List<NonTerminal<T>> nonTerminals = symbols.stream()
402 .filter(NonTerminal.class::isInstance)
403 .map(nt -> (NonTerminal<T>)nt)
404 .toList();
405
406 final List<Terminal<T>> terminals = symbols.stream()
407 .filter(Terminal.class::isInstance)
408 .map(nt -> (Terminal<T>)nt)
409 .toList();
410
411 return new Cfg<>(
412 nonTerminals,
413 terminals,
414 normalizedRules.stream()
415 .map(r -> rebuild(r, symbols))
416 .toList(),
417 (NonTerminal<T>)select(normalizedRules.get(0).start(), symbols)
418 );
419 }
420
421 /**
422 * Create a grammar object with the given rules. Duplicated rules are merged
423 * into one rule. The <em>start</em> symbol of the first rule is chosen as
424 * the start symbol of the created CFG
425 *
426 * @param rules the rules the grammar consists of
427 * @throws IllegalArgumentException if the list of rules is empty
428 * @throws NullPointerException if the list of rules is {@code null}
429 */
430 @SafeVarargs
431 public static <T> Cfg<T> of(final Rule<T>... rules) {
432 return Cfg.of(List.of(rules));
433 }
434
435 private static <T> List<Rule<T>> normalize(final List<Rule<T>> rules) {
436 final Map<NonTerminal<T>, List<Rule<T>>> grouped = rules.stream()
437 .collect(groupingBy(
438 Rule::start,
439 LinkedHashMap::new,
440 toCollection(ArrayList::new)));
441
442 return grouped.entrySet().stream()
443 .map(entry -> merge(entry.getKey(), entry.getValue()))
444 .toList();
445 }
446
447 private static <T> Rule<T> merge(final NonTerminal<T> start, final List<Rule<T>> rules) {
448 return new Rule<>(
449 start,
450 rules.stream()
451 .flatMap(rule -> rule.alternatives().stream())
452 .toList()
453 );
454 }
455
456 private static <T> Stream<Symbol<T>> ruleSymbols(final Rule<T> rule) {
457 return Stream.concat(
458 Stream.of(rule.start),
459 rule.alternatives.stream()
460 .flatMap(expr -> expr.symbols().stream())
461 );
462 }
463
464 private static <T> Rule<T> rebuild(final Rule<T> rule, final List<Symbol<T>> symbols) {
465 return new Rule<>(
466 (NonTerminal<T>)select(rule.start, symbols),
467 rule.alternatives.stream()
468 .map(e -> rebuild(e, symbols))
469 .toList()
470 );
471 }
472
473 private static <T> Expression<T>
474 rebuild(final Expression<T> expression, final List<Symbol<T>> symbols) {
475 return new Expression<>(
476 expression.symbols.stream()
477 .map(s -> select(s, symbols))
478 .toList()
479 );
480 }
481
482 private static <T> Symbol<T> select(
483 final Symbol<T> symbol,
484 final List<Symbol<T>> symbols
485 ) {
486 for (var s : symbols) {
487 if (s.name().equals(symbol.name())) {
488 return s;
489 }
490 }
491 throw new AssertionError("Symbol not found: " + symbol);
492 }
493
494 @SuppressWarnings("unchecked")
495 static <A, B extends A> Cfg<A> upcast(final Cfg<B> seq) {
496 return (Cfg<A>)seq;
497 }
498
499
500 /* *************************************************************************
501 * Static factory methods for rule creation.
502 * ************************************************************************/
503
504 /**
505 * Factory method for creating a terminal symbol with the given
506 * {@code name} and {@code value}.
507 *
508 * @param name the name of the terminal symbol
509 * @param value the value of the terminal symbol
510 * @param <T> the terminal symbol value type
511 * @return a new terminal symbol
512 */
513 public static <T> Terminal<T> T(final String name, final T value) {
514 return new Terminal<>(name, value);
515 }
516
517 /**
518 * Factory method for creating a terminal symbol with the given
519 * {@code name}.
520 *
521 * @param name the name of the terminal symbol
522 * @return a new terminal symbol
523 */
524 public static Terminal<String> T(final String name) {
525 return new Terminal<>(name, name);
526 }
527
528 /**
529 * Factory method for creating non-terminal symbols.
530 *
531 * @param name the name of the symbol.
532 * @param <T> the terminal symbol value type
533 * @return a new non-terminal symbol
534 */
535 public static <T> NonTerminal<T> N(final String name) {
536 return new NonTerminal<>(name);
537 }
538
539 /**
540 * Factory method for creating an expression with the given
541 * {@code symbols}.
542 *
543 * @param symbols the list of symbols of the expression
544 * @throws IllegalArgumentException if the list of {@code symbols} is
545 * empty
546 * @param <T> the terminal symbol value type
547 * @return a new expression
548 */
549 @SafeVarargs
550 public static <T> Expression<T> E(final Symbol<T>... symbols) {
551 return new Expression<>(List.of(symbols));
552 }
553
554 /**
555 * Factory method for creating a new rule.
556 *
557 * @param name the name of start symbol of the rule
558 * @param alternatives the list af alternative rule expressions
559 * @throws IllegalArgumentException if the given list of
560 * {@code alternatives} is empty
561 * @throws NullPointerException if one of the arguments is {@code null}
562 * @param <T> the terminal symbol value type
563 * @return a new rule
564 */
565 @SafeVarargs
566 public static <T> Rule<T> R(
567 final String name,
568 final Expression<T>... alternatives
569 ) {
570 return new Rule<>(new NonTerminal<>(name), List.of(alternatives));
571 }
572
573 }
|