001/* 002 * Java Genetic Algorithm Library (jenetics-7.2.0). 003 * Copyright (c) 2007-2023 Franz Wilhelmstötter 004 * 005 * Licensed under the Apache License, Version 2.0 (the "License"); 006 * you may not use this file except in compliance with the License. 007 * You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 * 017 * Author: 018 * Franz Wilhelmstötter (franz.wilhelmstoetter@gmail.com) 019 */ 020package io.jenetics.ext.grammar; 021 022import static java.lang.String.format; 023import static java.util.Objects.requireNonNull; 024import static java.util.stream.Collectors.groupingBy; 025import static java.util.stream.Collectors.toCollection; 026 027import java.util.ArrayList; 028import java.util.HashMap; 029import java.util.HashSet; 030import java.util.LinkedHashMap; 031import java.util.List; 032import java.util.Map; 033import java.util.Optional; 034import java.util.Set; 035import java.util.function.Function; 036import java.util.stream.Collectors; 037import java.util.stream.Stream; 038 039/** 040 * Represents a <em>context-free</em> grammar 041 * (<a href="https://en.wikipedia.org/wiki/Context-free_grammar"><b>CFG</b></a>). 042 * <p> 043 * <b>Formal definition</b> 044 * <p> 045 * A context-free grammar {@code G} is defined by the 4-tuple 046 * {@code G = (N, T, R, S)}, where 047 * <ul> 048 * <li>{@code N} is a finite set; each element {@code n ∈ N} is called a 049 * non-terminal ({@link NonTerminal}) character or a variable. Each 050 * variable represents a different type of phrase or clause in the sentence. 051 * Variables are also sometimes called syntactic categories. Each variable 052 * defines a sub-language of the language defined by {@code G}. 053 * </li> 054 * <li>{@code T} is a finite set of terminals ({@link Terminal}) disjoint 055 * from {@code N}, which make up the actual content of the sentence. The set 056 * of terminals is the alphabet of the language defined by the grammar 057 * {@code G}. 058 * </li> 059 * <li>{@code R} is a finite relation in {@code N × (N ∪ T)∗}, where the 060 * asterisk represents the <a href="https://en.wikipedia.org/wiki/Kleene_star"> 061 * Kleene star</a> operation. The members of {@code R} are called the 062 * (rewrite) rules ({@link Rule}) or productions of the grammar. 063 * </li> 064 * <li>{@code S} is the start variable (or start symbol), used to represent 065 * the whole sentence (or program). It must be an element of {@code N} 066 * ({@link NonTerminal}) 067 * .</li> 068 * </ul> 069 * 070 * You can easily create a <em>Cfg</em> object from a given BNF grammar. 071 * <pre>{@code 072 * final Cfg<String> grammar = Bnf.parse(""" 073 * <expr> ::= <num> | <var> | '(' <expr> <op> <expr> ')' 074 * <op> ::= + | - | * | / 075 * <var> ::= x | y 076 * <num> ::= 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 077 * """ 078 * ); 079 * }</pre> 080 * 081 * It is also possible to create the grammar above programmatically. 082 * <pre>{@code 083 * final Cfg<String> grammar = Cfg.of( 084 * R("expr", 085 * E(NT("num")), 086 * E(NT("var")), 087 * E(T("("), NT("expr"), NT("op"), NT("expr"), T(")")) 088 * ), 089 * R("op", E(T("+")), E(T("-")), E(T("*")), E(T("/"))), 090 * R("var", E(T("x")), E(T("y"))), 091 * R("num", 092 * E(T("0")), E(T("1")), E(T("2")), E(T("3")), 093 * E(T("4")), E(T("5")), E(T("6")), E(T("7")), 094 * E(T("8")), E(T("9")) 095 * ) 096 * ); 097 * }</pre> 098 * 099 * @see Bnf#parse(String) 100 * 101 * @param <T> the terminal symbol value type 102 * 103 * @author <a href="mailto:franz.wilhelmstoetter@gmail.com">Franz Wilhelmstötter</a> 104 * @since 7.1 105 * @version 7.1 106 */ 107public record Cfg<T>( 108 List<NonTerminal<T>> nonTerminals, 109 List<Terminal<T>> terminals, 110 List<Rule<T>> rules, 111 NonTerminal<T> start 112) { 113 114 /** 115 * Represents the <em>symbols</em> the BNF grammar consists. 116 * 117 * @param <T> the terminal symbol value type 118 */ 119 public sealed interface Symbol<T> { 120 121 /** 122 * Return the name of the symbol. 123 * 124 * @return the name of the symbol 125 */ 126 String name(); 127 128 } 129 130 /** 131 * Represents the non-terminal symbols of the grammar ({@code NT}). 132 * 133 * @param <T> the terminal symbol value type 134 */ 135 public record NonTerminal<T>(String name) implements Symbol<T> { 136 137 /** 138 * @param name the name of the non-terminal symbol 139 * @throws IllegalArgumentException if the given {@code name} is not 140 * a valid <em>non-terminal</em> name 141 * @throws NullPointerException if one of the arguments is {@code null} 142 */ 143 public NonTerminal { 144 if (name.isEmpty()) { 145 throw new IllegalArgumentException( 146 "Non-terminal value must not be empty." 147 ); 148 } 149 } 150 } 151 152 /** 153 * Represents a terminal symbols of the grammar ({@code T}). 154 * 155 * @param <T> the terminal symbol value type 156 */ 157 public record Terminal<T>(String name, T value) implements Symbol<T> { 158 159 /** 160 * @param name the name of the terminal symbol 161 * @param value the value of the terminal symbol 162 * @throws IllegalArgumentException if the given terminal {@code name} 163 * is empty 164 */ 165 public Terminal { 166 if (name.isEmpty()) { 167 throw new IllegalArgumentException( 168 "Terminal value must not be empty." 169 ); 170 } 171 } 172 173 /** 174 * Return a new terminal symbol where the name of the symbol is equal 175 * to its value. 176 * 177 * @param name the name (and value) of the terminal symbol 178 * @return a new terminal symbol with the given {@code name} 179 * @throws IllegalArgumentException if the given terminal {@code name} 180 * is empty 181 */ 182 public static Terminal<String> of(final String name) { 183 return new Terminal<>(name, name); 184 } 185 186 } 187 188 /** 189 * Represents one <em>expression</em> (list of alternative symbols) a 190 * production rule consists of. 191 * 192 * @param <T> the terminal symbol value type 193 */ 194 public record Expression<T>(List<Symbol<T>> symbols) { 195 196 /** 197 * @param symbols the list of symbols of the expression 198 * @throws IllegalArgumentException if the list of {@code symbols} is 199 * empty 200 */ 201 public Expression { 202 if (symbols.isEmpty()) { 203 throw new IllegalArgumentException( 204 "The list of symbols must not be empty." 205 ); 206 } 207 symbols = List.copyOf(symbols); 208 } 209 210 } 211 212 /** 213 * Represents a production rule of the grammar ({@code R}). 214 * 215 * @param <T> the terminal symbol value type 216 */ 217 public record Rule<T>(NonTerminal<T> start, List<Expression<T>> alternatives) { 218 219 /** 220 * Creates a new rule object. 221 * 222 * @param start the start symbol of the rule 223 * @param alternatives the list of alternative rule expressions 224 * @throws IllegalArgumentException if the given list of 225 * {@code alternatives} is empty 226 * @throws NullPointerException if one of the arguments is {@code null} 227 */ 228 public Rule { 229 requireNonNull(start); 230 if (alternatives.isEmpty()) { 231 throw new IllegalArgumentException( 232 "Rule alternatives must not be empty." 233 ); 234 } 235 alternatives = List.copyOf(alternatives); 236 } 237 238 } 239 240 /** 241 * Create a new <em>context-free</em> grammar object. 242 * 243 * @param nonTerminals the non-terminal symbols of {@code this} grammar 244 * @param terminals the terminal symbols of {@code this} grammar 245 * @param rules the <em>production</em> rules of {@code this} grammar 246 * @param start the start symbol of {@code this} grammar 247 * @throws NullPointerException if one of the arguments is {@code null} 248 * @throws IllegalArgumentException if a rule is defined more than once, the 249 * start symbol points to a missing rule or the rules uses symbols 250 * not defined in the list of {@link #nonTerminals()} or 251 * {@link #terminals()} 252 */ 253 public Cfg { 254 if (rules.isEmpty()) { 255 throw new IllegalArgumentException( 256 "The given list of rules must not be empty." 257 ); 258 } 259 260 // Check the uniqueness of the rules. 261 final var duplicatedRules = rules.stream() 262 .collect(Collectors.groupingBy(Rule::start)) 263 .values().stream() 264 .filter(values -> values.size() > 1) 265 .map(rule -> rule.get(0).start.name) 266 .toList(); 267 268 if (!duplicatedRules.isEmpty()) { 269 throw new IllegalArgumentException( 270 "Found duplicate rule(s), " + duplicatedRules + "." 271 ); 272 } 273 274 // Check if start symbol points to an existing rule. 275 final var startRule = rules.stream() 276 .filter(r -> start.equals(r.start)) 277 .findFirst(); 278 if (startRule.isEmpty()) { 279 throw new IllegalArgumentException( 280 "No rule found for start symbol %s.".formatted(start) 281 ); 282 } 283 284 // Check that all symbols used in the given rules are also defined 285 // in the list of non-terminals and terminals. 286 final Set<Symbol<T>> required = rules.stream() 287 .flatMap(Cfg::ruleSymbols) 288 .collect(Collectors.toUnmodifiableSet()); 289 290 final Set<Symbol<T>> available = Stream 291 .concat(nonTerminals.stream(), terminals.stream()) 292 .collect(Collectors.toUnmodifiableSet()); 293 294 final var missing = new HashSet<>(required); 295 missing.removeAll(available); 296 297 if (!missing.isEmpty()) { 298 throw new IllegalArgumentException( 299 "Unknown symbols defined in rules: " + missing 300 ); 301 } 302 303 // Check if the name of terminals and non-terminals are distinct. 304 final var terminalNames = terminals.stream() 305 .map(Symbol::name) 306 .collect(Collectors.toSet()); 307 308 final var nonTerminalNames = nonTerminals.stream() 309 .map(Symbol::name) 310 .collect(Collectors.toSet()); 311 312 terminalNames.retainAll(nonTerminalNames); 313 if (!terminalNames.isEmpty()) { 314 throw new IllegalArgumentException(format( 315 "Terminal and non-terminal symbols with same name: %s", 316 terminalNames.stream().sorted().toList() 317 )); 318 } 319 320 nonTerminals = List.copyOf(nonTerminals); 321 terminals = List.copyOf(terminals); 322 rules = List.copyOf(rules); 323 requireNonNull(start); 324 } 325 326 /** 327 * Return the rule for the given {@code start} symbol. 328 * 329 * @param start the start symbol of the rule 330 * @return the rule for the given {@code start} symbol 331 * @throws NullPointerException if the given {@code start} symbol is 332 * {@code null} 333 */ 334 public Optional<Rule<T>> rule(final NonTerminal<?> start) { 335 requireNonNull(start); 336 for (var rule : rules) { 337 if (rule.start().name().equals(start.name())) { 338 return Optional.of(rule); 339 } 340 } 341 return Optional.empty(); 342 } 343 344 /** 345 * Maps the values of the terminal symbols from type {@code T} to type 346 * {@code A}. 347 * 348 * @param mapper the mapper function 349 * @param <A> the new value type of the terminal symbols 350 * @return the mapped grammar 351 * @throws NullPointerException if the given mapper is {@code null} 352 */ 353 public <A> Cfg<A> map(final Function<? super Terminal<T>, ? extends A> mapper) { 354 requireNonNull(mapper); 355 356 final var cache = new HashMap<Terminal<T>, Terminal<A>>(); 357 final Function<Terminal<T>, Terminal<A>> mapping = t -> cache 358 .computeIfAbsent(t, t2 -> new Terminal<>(t2.name(), mapper.apply(t2))); 359 360 @SuppressWarnings("unchecked") 361 final List<Rule<A>> rules = rules().stream() 362 .map(rule -> new Rule<>( 363 (NonTerminal<A>)rule.start(), 364 rule.alternatives().stream() 365 .map(expr -> new Expression<>( 366 expr.symbols().stream() 367 .map(sym -> sym instanceof Cfg.Terminal<T> t 368 ? mapping.apply(t) : (Symbol<A>)sym) 369 .toList() 370 )) 371 .toList() 372 )) 373 .toList(); 374 375 return Cfg.of(rules); 376 } 377 378 /** 379 * Create a grammar object with the given rules. Duplicated rules are merged 380 * into one rule. The <em>start</em> symbol of the first rule is chosen as 381 * the start symbol of the created CFG 382 * 383 * @param rules the rules the grammar consists of 384 * @throws IllegalArgumentException if the list of rules is empty 385 * @throws NullPointerException if the list of rules is {@code null} 386 */ 387 public static <T> Cfg<T> of(final List<Rule<T>> rules) { 388 if (rules.isEmpty()) { 389 throw new IllegalArgumentException( 390 "The list of rules must not be empty." 391 ); 392 } 393 394 final List<Rule<T>> normalizedRules = normalize(rules); 395 396 final List<Symbol<T>> symbols = normalizedRules.stream() 397 .flatMap(Cfg::ruleSymbols) 398 .distinct() 399 .toList(); 400 401 final List<NonTerminal<T>> nonTerminals = symbols.stream() 402 .filter(NonTerminal.class::isInstance) 403 .map(nt -> (NonTerminal<T>)nt) 404 .toList(); 405 406 final List<Terminal<T>> terminals = symbols.stream() 407 .filter(Terminal.class::isInstance) 408 .map(nt -> (Terminal<T>)nt) 409 .toList(); 410 411 return new Cfg<>( 412 nonTerminals, 413 terminals, 414 normalizedRules.stream() 415 .map(r -> rebuild(r, symbols)) 416 .toList(), 417 (NonTerminal<T>)select(normalizedRules.get(0).start(), symbols) 418 ); 419 } 420 421 /** 422 * Create a grammar object with the given rules. Duplicated rules are merged 423 * into one rule. The <em>start</em> symbol of the first rule is chosen as 424 * the start symbol of the created CFG 425 * 426 * @param rules the rules the grammar consists of 427 * @throws IllegalArgumentException if the list of rules is empty 428 * @throws NullPointerException if the list of rules is {@code null} 429 */ 430 @SafeVarargs 431 public static <T> Cfg<T> of(final Rule<T>... rules) { 432 return Cfg.of(List.of(rules)); 433 } 434 435 private static <T> List<Rule<T>> normalize(final List<Rule<T>> rules) { 436 final Map<NonTerminal<T>, List<Rule<T>>> grouped = rules.stream() 437 .collect(groupingBy( 438 Rule::start, 439 LinkedHashMap::new, 440 toCollection(ArrayList::new))); 441 442 return grouped.entrySet().stream() 443 .map(entry -> merge(entry.getKey(), entry.getValue())) 444 .toList(); 445 } 446 447 private static <T> Rule<T> merge(final NonTerminal<T> start, final List<Rule<T>> rules) { 448 return new Rule<>( 449 start, 450 rules.stream() 451 .flatMap(rule -> rule.alternatives().stream()) 452 .toList() 453 ); 454 } 455 456 private static <T> Stream<Symbol<T>> ruleSymbols(final Rule<T> rule) { 457 return Stream.concat( 458 Stream.of(rule.start), 459 rule.alternatives.stream() 460 .flatMap(expr -> expr.symbols().stream()) 461 ); 462 } 463 464 private static <T> Rule<T> rebuild(final Rule<T> rule, final List<Symbol<T>> symbols) { 465 return new Rule<>( 466 (NonTerminal<T>)select(rule.start, symbols), 467 rule.alternatives.stream() 468 .map(e -> rebuild(e, symbols)) 469 .toList() 470 ); 471 } 472 473 private static <T> Expression<T> 474 rebuild(final Expression<T> expression, final List<Symbol<T>> symbols) { 475 return new Expression<>( 476 expression.symbols.stream() 477 .map(s -> select(s, symbols)) 478 .toList() 479 ); 480 } 481 482 private static <T> Symbol<T> select( 483 final Symbol<T> symbol, 484 final List<Symbol<T>> symbols 485 ) { 486 for (var s : symbols) { 487 if (s.name().equals(symbol.name())) { 488 return s; 489 } 490 } 491 throw new AssertionError("Symbol not found: " + symbol); 492 } 493 494 @SuppressWarnings("unchecked") 495 static <A, B extends A> Cfg<A> upcast(final Cfg<B> seq) { 496 return (Cfg<A>)seq; 497 } 498 499 500 /* ************************************************************************* 501 * Static factory methods for rule creation. 502 * ************************************************************************/ 503 504 /** 505 * Factory method for creating a terminal symbol with the given 506 * {@code name} and {@code value}. 507 * 508 * @param name the name of the terminal symbol 509 * @param value the value of the terminal symbol 510 * @param <T> the terminal symbol value type 511 * @return a new terminal symbol 512 */ 513 public static <T> Terminal<T> T(final String name, final T value) { 514 return new Terminal<>(name, value); 515 } 516 517 /** 518 * Factory method for creating a terminal symbol with the given 519 * {@code name}. 520 * 521 * @param name the name of the terminal symbol 522 * @return a new terminal symbol 523 */ 524 public static Terminal<String> T(final String name) { 525 return new Terminal<>(name, name); 526 } 527 528 /** 529 * Factory method for creating non-terminal symbols. 530 * 531 * @param name the name of the symbol. 532 * @param <T> the terminal symbol value type 533 * @return a new non-terminal symbol 534 */ 535 public static <T> NonTerminal<T> N(final String name) { 536 return new NonTerminal<>(name); 537 } 538 539 /** 540 * Factory method for creating an expression with the given 541 * {@code symbols}. 542 * 543 * @param symbols the list of symbols of the expression 544 * @throws IllegalArgumentException if the list of {@code symbols} is 545 * empty 546 * @param <T> the terminal symbol value type 547 * @return a new expression 548 */ 549 @SafeVarargs 550 public static <T> Expression<T> E(final Symbol<T>... symbols) { 551 return new Expression<>(List.of(symbols)); 552 } 553 554 /** 555 * Factory method for creating a new rule. 556 * 557 * @param name the name of start symbol of the rule 558 * @param alternatives the list af alternative rule expressions 559 * @throws IllegalArgumentException if the given list of 560 * {@code alternatives} is empty 561 * @throws NullPointerException if one of the arguments is {@code null} 562 * @param <T> the terminal symbol value type 563 * @return a new rule 564 */ 565 @SafeVarargs 566 public static <T> Rule<T> R( 567 final String name, 568 final Expression<T>... alternatives 569 ) { 570 return new Rule<>(new NonTerminal<>(name), List.of(alternatives)); 571 } 572 573}