001/*
002 * Java Genetic Algorithm Library (jenetics-7.2.0).
003 * Copyright (c) 2007-2023 Franz Wilhelmstötter
004 *
005 * Licensed under the Apache License, Version 2.0 (the "License");
006 * you may not use this file except in compliance with the License.
007 * You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 *
017 * Author:
018 *    Franz Wilhelmstötter (franz.wilhelmstoetter@gmail.com)
019 */
020package io.jenetics.ext.grammar;
021
022import static java.lang.String.format;
023import static java.util.Objects.requireNonNull;
024import static java.util.stream.Collectors.groupingBy;
025import static java.util.stream.Collectors.toCollection;
026
027import java.util.ArrayList;
028import java.util.HashMap;
029import java.util.HashSet;
030import java.util.LinkedHashMap;
031import java.util.List;
032import java.util.Map;
033import java.util.Optional;
034import java.util.Set;
035import java.util.function.Function;
036import java.util.stream.Collectors;
037import java.util.stream.Stream;
038
039/**
040 * Represents a <em>context-free</em> grammar
041 * (<a href="https://en.wikipedia.org/wiki/Context-free_grammar"><b>CFG</b></a>).
042 * <p>
043 * <b>Formal definition</b>
044 * <p>
045 * A context-free grammar {@code G} is defined by the 4-tuple
046 * {@code G = (N, T, R, S)}, where
047 * <ul>
048 *     <li>{@code N} is a finite set; each element {@code n ∈ N} is called a
049 *     non-terminal ({@link NonTerminal}) character or a variable. Each
050 *     variable represents a different type of phrase or clause in the sentence.
051 *     Variables are also sometimes called syntactic categories. Each variable
052 *     defines a sub-language of the language defined by {@code G}.
053 *     </li>
054 *     <li>{@code T} is a finite set of terminals ({@link Terminal}) disjoint
055 *     from {@code N}, which make up the actual content of the sentence. The set
056 *     of terminals is the alphabet of the language defined by the grammar
057 *     {@code G}.
058 *     </li>
059 *     <li>{@code R} is a finite relation in {@code N × (N ∪ T)∗}, where the
060 *     asterisk represents the <a href="https://en.wikipedia.org/wiki/Kleene_star">
061 *     Kleene star</a> operation. The members of {@code R} are called the
062 *     (rewrite) rules ({@link Rule}) or productions of the grammar.
063 *     </li>
064 *     <li>{@code S} is the start variable (or start symbol), used to represent
065 *     the whole sentence (or program). It must be an element of {@code N}
066 *     ({@link NonTerminal})
067 *     .</li>
068 * </ul>
069 *
070 * You can easily create a <em>Cfg</em> object from a given BNF grammar.
071 * <pre>{@code
072 * final Cfg<String> grammar = Bnf.parse("""
073 *     <expr> ::= <num> | <var> | '(' <expr> <op> <expr> ')'
074 *     <op>   ::= + | - | * | /
075 *     <var>  ::= x | y
076 *     <num>  ::= 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
077 *     """
078 * );
079 * }</pre>
080 *
081 * It is also possible to create the grammar above programmatically.
082 * <pre>{@code
083 * final Cfg<String> grammar = Cfg.of(
084 *     R("expr",
085 *         E(NT("num")),
086 *         E(NT("var")),
087 *         E(T("("), NT("expr"), NT("op"), NT("expr"), T(")"))
088 *     ),
089 *     R("op", E(T("+")), E(T("-")), E(T("*")), E(T("/"))),
090 *     R("var", E(T("x")), E(T("y"))),
091 *     R("num",
092 *         E(T("0")), E(T("1")), E(T("2")), E(T("3")),
093 *         E(T("4")), E(T("5")), E(T("6")), E(T("7")),
094 *         E(T("8")), E(T("9"))
095 *     )
096 * );
097 * }</pre>
098 *
099 * @see Bnf#parse(String)
100 *
101 * @param <T> the terminal symbol value type
102 *
103 * @author <a href="mailto:franz.wilhelmstoetter@gmail.com">Franz Wilhelmstötter</a>
104 * @since 7.1
105 * @version 7.1
106 */
107public record Cfg<T>(
108        List<NonTerminal<T>> nonTerminals,
109        List<Terminal<T>> terminals,
110        List<Rule<T>> rules,
111        NonTerminal<T> start
112) {
113
114        /**
115         * Represents the <em>symbols</em> the BNF grammar consists.
116         *
117         * @param <T> the terminal symbol value type
118         */
119        public sealed interface Symbol<T> {
120
121                /**
122                 * Return the name of the symbol.
123                 *
124                 * @return the name of the symbol
125                 */
126                String name();
127
128        }
129
130        /**
131         * Represents the non-terminal symbols of the grammar ({@code NT}).
132         *
133         * @param <T> the terminal symbol value type
134         */
135        public record NonTerminal<T>(String name) implements Symbol<T> {
136
137                /**
138                 * @param name the name of the non-terminal symbol
139                 * @throws IllegalArgumentException if the given {@code name} is not
140                 *         a valid <em>non-terminal</em> name
141                 * @throws NullPointerException if one of the arguments is {@code null}
142                 */
143                public NonTerminal {
144                        if (name.isEmpty()) {
145                                throw new IllegalArgumentException(
146                                        "Non-terminal value must not be empty."
147                                );
148                        }
149                }
150        }
151
152        /**
153         * Represents a terminal symbols of the grammar ({@code T}).
154         *
155         * @param <T> the terminal symbol value type
156         */
157        public record Terminal<T>(String name, T value) implements Symbol<T> {
158
159                /**
160                 * @param name the name of the terminal symbol
161                 * @param value the value of the terminal symbol
162                 * @throws IllegalArgumentException if the given terminal {@code name}
163                 *         is empty
164                 */
165                public Terminal {
166                        if (name.isEmpty()) {
167                                throw new IllegalArgumentException(
168                                        "Terminal value must not be empty."
169                                );
170                        }
171                }
172
173                /**
174                 * Return a new terminal symbol where the name of the symbol is equal
175                 * to its value.
176                 *
177                 * @param name the name (and value) of the terminal symbol
178                 * @return a new terminal symbol with the given {@code name}
179                 * @throws IllegalArgumentException if the given terminal {@code name}
180                 *         is empty
181                 */
182                public static Terminal<String> of(final String name) {
183                        return new Terminal<>(name, name);
184                }
185
186        }
187
188        /**
189         * Represents one <em>expression</em> (list of alternative symbols) a
190         * production rule consists of.
191         *
192         * @param <T> the terminal symbol value type
193         */
194        public record Expression<T>(List<Symbol<T>> symbols) {
195
196                /**
197                 * @param symbols the list of symbols of the expression
198                 * @throws IllegalArgumentException if the list of {@code symbols} is
199                 *         empty
200                 */
201                public Expression {
202                        if (symbols.isEmpty()) {
203                                throw new IllegalArgumentException(
204                                        "The list of symbols must not be empty."
205                                );
206                        }
207                        symbols = List.copyOf(symbols);
208                }
209
210        }
211
212        /**
213         * Represents a production rule of the grammar ({@code R}).
214         *
215         * @param <T> the terminal symbol value type
216         */
217        public record Rule<T>(NonTerminal<T> start, List<Expression<T>> alternatives) {
218
219                /**
220                 * Creates a new rule object.
221                 *
222                 * @param start the start symbol of the rule
223                 * @param alternatives the list of alternative rule expressions
224                 * @throws IllegalArgumentException if the given list of
225                 *         {@code alternatives} is empty
226                 * @throws NullPointerException if one of the arguments is {@code null}
227                 */
228                public Rule {
229                        requireNonNull(start);
230                        if (alternatives.isEmpty()) {
231                                throw new IllegalArgumentException(
232                                        "Rule alternatives must not be empty."
233                                );
234                        }
235                        alternatives = List.copyOf(alternatives);
236                }
237
238        }
239
240        /**
241         * Create a new <em>context-free</em> grammar object.
242         *
243         * @param nonTerminals the non-terminal symbols of {@code this} grammar
244         * @param terminals the terminal symbols of {@code this} grammar
245         * @param rules the <em>production</em> rules of {@code this} grammar
246         * @param start the start symbol of {@code this} grammar
247         * @throws NullPointerException if one of the arguments is {@code null}
248         * @throws IllegalArgumentException if a rule is defined more than once, the
249         *         start symbol points to a missing rule or the rules uses symbols
250         *         not defined in the list of {@link #nonTerminals()} or
251         *         {@link #terminals()}
252         */
253        public Cfg {
254                if (rules.isEmpty()) {
255                        throw new IllegalArgumentException(
256                                "The given list of rules must not be empty."
257                        );
258                }
259
260                // Check the uniqueness of the rules.
261                final var duplicatedRules = rules.stream()
262                        .collect(Collectors.groupingBy(Rule::start))
263                        .values().stream()
264                        .filter(values -> values.size() > 1)
265                        .map(rule -> rule.get(0).start.name)
266                        .toList();
267
268                if (!duplicatedRules.isEmpty()) {
269                        throw new IllegalArgumentException(
270                                "Found duplicate rule(s), " + duplicatedRules + "."
271                        );
272                }
273
274                // Check if start symbol points to an existing rule.
275                final var startRule = rules.stream()
276                        .filter(r -> start.equals(r.start))
277                        .findFirst();
278                if (startRule.isEmpty()) {
279                        throw new IllegalArgumentException(
280                                "No rule found for start symbol %s.".formatted(start)
281                        );
282                }
283
284                // Check that all symbols used in the given rules are also defined
285                // in the list of non-terminals and terminals.
286                final Set<Symbol<T>> required = rules.stream()
287                        .flatMap(Cfg::ruleSymbols)
288                        .collect(Collectors.toUnmodifiableSet());
289
290                final Set<Symbol<T>> available = Stream
291                        .concat(nonTerminals.stream(), terminals.stream())
292                        .collect(Collectors.toUnmodifiableSet());
293
294                final var missing = new HashSet<>(required);
295                missing.removeAll(available);
296
297                if (!missing.isEmpty()) {
298                        throw new IllegalArgumentException(
299                                "Unknown symbols defined in rules: " + missing
300                        );
301                }
302
303                // Check if the name of terminals and non-terminals are distinct.
304                final var terminalNames = terminals.stream()
305                        .map(Symbol::name)
306                        .collect(Collectors.toSet());
307
308                final var nonTerminalNames = nonTerminals.stream()
309                        .map(Symbol::name)
310                        .collect(Collectors.toSet());
311
312                terminalNames.retainAll(nonTerminalNames);
313                if (!terminalNames.isEmpty()) {
314                        throw new IllegalArgumentException(format(
315                                "Terminal and non-terminal symbols with same name: %s",
316                                terminalNames.stream().sorted().toList()
317                        ));
318                }
319
320                nonTerminals = List.copyOf(nonTerminals);
321                terminals = List.copyOf(terminals);
322                rules = List.copyOf(rules);
323                requireNonNull(start);
324        }
325
326        /**
327         * Return the rule for the given {@code start} symbol.
328         *
329         * @param start the start symbol of the rule
330         * @return the rule for the given {@code start} symbol
331         * @throws NullPointerException if the given {@code start} symbol is
332         *         {@code null}
333         */
334        public Optional<Rule<T>> rule(final NonTerminal<?> start) {
335                requireNonNull(start);
336                for (var rule : rules) {
337                        if (rule.start().name().equals(start.name())) {
338                                return Optional.of(rule);
339                        }
340                }
341                return Optional.empty();
342        }
343
344        /**
345         * Maps the values of the terminal symbols from type {@code T} to type
346         * {@code A}.
347         *
348         * @param mapper the mapper function
349         * @param <A> the new value type of the terminal symbols
350         * @return the mapped grammar
351         * @throws NullPointerException if the given mapper is {@code null}
352         */
353        public <A> Cfg<A> map(final Function<? super Terminal<T>, ? extends A> mapper) {
354                requireNonNull(mapper);
355
356                final var cache = new HashMap<Terminal<T>, Terminal<A>>();
357                final Function<Terminal<T>, Terminal<A>> mapping = t -> cache
358                        .computeIfAbsent(t, t2 -> new Terminal<>(t2.name(), mapper.apply(t2)));
359
360                @SuppressWarnings("unchecked")
361                final List<Rule<A>> rules = rules().stream()
362                        .map(rule -> new Rule<>(
363                                (NonTerminal<A>)rule.start(),
364                                rule.alternatives().stream()
365                                        .map(expr -> new Expression<>(
366                                                expr.symbols().stream()
367                                                        .map(sym -> sym instanceof Cfg.Terminal<T> t
368                                                                ? mapping.apply(t) : (Symbol<A>)sym)
369                                                        .toList()
370                                                ))
371                                        .toList()
372                                ))
373                        .toList();
374
375                return Cfg.of(rules);
376        }
377
378        /**
379         * Create a grammar object with the given rules. Duplicated rules are merged
380         * into one rule. The <em>start</em> symbol of the first rule is chosen as
381         * the start symbol of the created CFG
382         *
383         * @param rules the rules the grammar consists of
384         * @throws IllegalArgumentException if the list of rules is empty
385         * @throws NullPointerException if the list of rules is {@code null}
386         */
387        public static <T> Cfg<T> of(final List<Rule<T>> rules) {
388                if (rules.isEmpty()) {
389                        throw new IllegalArgumentException(
390                                "The list of rules must not be empty."
391                        );
392                }
393
394                final List<Rule<T>> normalizedRules = normalize(rules);
395
396                final List<Symbol<T>> symbols = normalizedRules.stream()
397                        .flatMap(Cfg::ruleSymbols)
398                        .distinct()
399                        .toList();
400
401                final List<NonTerminal<T>> nonTerminals = symbols.stream()
402                        .filter(NonTerminal.class::isInstance)
403                        .map(nt -> (NonTerminal<T>)nt)
404                        .toList();
405
406                final List<Terminal<T>> terminals = symbols.stream()
407                        .filter(Terminal.class::isInstance)
408                        .map(nt -> (Terminal<T>)nt)
409                        .toList();
410
411                return new Cfg<>(
412                        nonTerminals,
413                        terminals,
414                        normalizedRules.stream()
415                                .map(r -> rebuild(r, symbols))
416                                .toList(),
417                        (NonTerminal<T>)select(normalizedRules.get(0).start(), symbols)
418                );
419        }
420
421        /**
422         * Create a grammar object with the given rules. Duplicated rules are merged
423         * into one rule. The <em>start</em> symbol of the first rule is chosen as
424         * the start symbol of the created CFG
425         *
426         * @param rules the rules the grammar consists of
427         * @throws IllegalArgumentException if the list of rules is empty
428         * @throws NullPointerException if the list of rules is {@code null}
429         */
430        @SafeVarargs
431        public static <T> Cfg<T> of(final Rule<T>... rules) {
432                return Cfg.of(List.of(rules));
433        }
434
435        private static <T> List<Rule<T>> normalize(final List<Rule<T>> rules) {
436                final Map<NonTerminal<T>, List<Rule<T>>> grouped = rules.stream()
437                        .collect(groupingBy(
438                                Rule::start,
439                                LinkedHashMap::new,
440                                toCollection(ArrayList::new)));
441
442                return grouped.entrySet().stream()
443                        .map(entry -> merge(entry.getKey(), entry.getValue()))
444                        .toList();
445        }
446
447        private static <T> Rule<T> merge(final NonTerminal<T> start, final List<Rule<T>> rules) {
448                return new Rule<>(
449                        start,
450                        rules.stream()
451                                .flatMap(rule -> rule.alternatives().stream())
452                                .toList()
453                );
454        }
455
456        private static <T> Stream<Symbol<T>> ruleSymbols(final Rule<T> rule) {
457                return Stream.concat(
458                        Stream.of(rule.start),
459                        rule.alternatives.stream()
460                                .flatMap(expr -> expr.symbols().stream())
461                );
462        }
463
464        private static <T> Rule<T> rebuild(final Rule<T> rule, final List<Symbol<T>> symbols) {
465                return new Rule<>(
466                        (NonTerminal<T>)select(rule.start, symbols),
467                        rule.alternatives.stream()
468                                .map(e -> rebuild(e, symbols))
469                                .toList()
470                );
471        }
472
473        private static <T> Expression<T>
474        rebuild(final Expression<T> expression, final List<Symbol<T>> symbols) {
475                return new Expression<>(
476                        expression.symbols.stream()
477                                .map(s -> select(s, symbols))
478                                .toList()
479                );
480        }
481
482        private static <T> Symbol<T> select(
483                final Symbol<T> symbol,
484                final List<Symbol<T>> symbols
485        ) {
486                for (var s : symbols) {
487                        if (s.name().equals(symbol.name())) {
488                                return s;
489                        }
490                }
491                throw new AssertionError("Symbol not found: " + symbol);
492        }
493
494        @SuppressWarnings("unchecked")
495        static <A, B extends A> Cfg<A> upcast(final Cfg<B> seq) {
496                return (Cfg<A>)seq;
497        }
498
499
500        /* *************************************************************************
501         * Static factory methods for rule creation.
502         * ************************************************************************/
503
504        /**
505         * Factory method for creating a terminal symbol with the given
506         * {@code name} and {@code value}.
507         *
508         * @param name the name of the terminal symbol
509         * @param value the value of the terminal symbol
510         * @param <T> the terminal symbol value type
511         * @return a new terminal symbol
512         */
513        public static <T> Terminal<T> T(final String name, final T value) {
514                return new Terminal<>(name, value);
515        }
516
517        /**
518         * Factory method for creating a terminal symbol with the given
519         * {@code name}.
520         *
521         * @param name the name of the terminal symbol
522         * @return a new terminal symbol
523         */
524        public static Terminal<String> T(final String name) {
525                return new Terminal<>(name, name);
526        }
527
528        /**
529         * Factory method for creating non-terminal symbols.
530         *
531         * @param name the name of the symbol.
532         * @param <T> the terminal symbol value type
533         * @return a new non-terminal symbol
534         */
535        public static <T> NonTerminal<T> N(final String name) {
536                return new NonTerminal<>(name);
537        }
538
539        /**
540         * Factory method for creating an expression with the given
541         * {@code symbols}.
542         *
543         * @param symbols the list of symbols of the expression
544         * @throws IllegalArgumentException if the list of {@code symbols} is
545         *         empty
546         * @param <T> the terminal symbol value type
547         * @return a new expression
548         */
549        @SafeVarargs
550        public static <T> Expression<T> E(final Symbol<T>... symbols) {
551                return new Expression<>(List.of(symbols));
552        }
553
554        /**
555         * Factory method for creating a new rule.
556         *
557         * @param name the name of start symbol of the rule
558         * @param alternatives the list af alternative rule expressions
559         * @throws IllegalArgumentException if the given list of
560         *         {@code alternatives} is empty
561         * @throws NullPointerException if one of the arguments is {@code null}
562         * @param <T> the terminal symbol value type
563         * @return a new rule
564         */
565        @SafeVarargs
566        public static <T> Rule<T> R(
567                final String name,
568                final Expression<T>... alternatives
569        ) {
570                return new Rule<>(new NonTerminal<>(name), List.of(alternatives));
571        }
572
573}