001/*
002 * Java Genetic Algorithm Library (jenetics-8.3.0).
003 * Copyright (c) 2007-2025 Franz Wilhelmstötter
004 *
005 * Licensed under the Apache License, Version 2.0 (the "License");
006 * you may not use this file except in compliance with the License.
007 * You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 *
017 * Author:
018 *    Franz Wilhelmstötter (franz.wilhelmstoetter@gmail.com)
019 */
020package io.jenetics.prog.regression;
021
022import static java.lang.Math.pow;
023import static java.lang.String.format;
024import static java.util.Objects.requireNonNull;
025
026import java.util.ArrayList;
027import java.util.List;
028import java.util.function.Function;
029import java.util.function.Predicate;
030import java.util.stream.Collectors;
031
032import io.jenetics.Genotype;
033import io.jenetics.engine.Codec;
034import io.jenetics.engine.Problem;
035import io.jenetics.util.ISeq;
036
037import io.jenetics.ext.util.Tree;
038
039import io.jenetics.prog.ProgramChromosome;
040import io.jenetics.prog.ProgramGene;
041import io.jenetics.prog.op.Op;
042import io.jenetics.prog.regression.Sampling.Result;
043
044/**
045 * This class implements a <em>symbolic</em> regression problem. The example
046 * below shows a typical usage of the {@code Regression} class.
047 * {@snippet lang="java":
048 * public class SymbolicRegression {
049 *     private static final ISeq<Op<Double>> OPERATIONS =
050 *         ISeq.of(MathOp.ADD, MathOp.SUB, MathOp.MUL);
051 *
052 *     private static final ISeq<Op<Double>> TERMINALS = ISeq.of(
053 *         Var.of("x", 0),
054 *         EphemeralConst.of(() -> (double)RandomRegistry.random().nextInt(10))
055 *     );
056 *
057 *     private static final Regression<Double> REGRESSION = Regression.of(
058 *         Regression.codecOf(OPERATIONS, TERMINALS, 5),
059 *         Error.of(LossFunction::mse),
060 *         Sample.ofDouble(-1.0, -8.0000),
061 *         // ...
062 *         Sample.ofDouble(0.9, 1.3860),
063 *         Sample.ofDouble(1.0, 2.0000)
064 *     );
065 *
066 *     public static void main(final String[] args) {
067 *         final Engine<ProgramGene<Double>, Double> engine = Engine
068 *             .builder(REGRESSION)
069 *             .minimizing()
070 *             .alterers(
071 *                 new SingleNodeCrossover<>(0.1),
072 *                 new Mutator<>())
073 *             .build();
074 *
075 *         final EvolutionResult<ProgramGene<Double>, Double> result = engine.stream()
076 *             .limit(Limits.byFitnessThreshold(0.01))
077 *             .collect(EvolutionResult.toBestEvolutionResult());
078 *
079 *         final ProgramGene<Double> program = result.bestPhenotype()
080 *             .genotype()
081 *             .gene();
082 *
083 *         final TreeNode<Op<Double>> tree = program.toTreeNode();
084 *         MathExpr.rewrite(tree); // Simplify result program.
085 *         System.out.println("Generations: " + result.totalGenerations());
086 *         System.out.println("Function:    " + new MathExpr(tree));
087 *         System.out.println("Error:       " + REGRESSION.error(tree));
088 *     }
089 * }
090 * }
091 *
092 * @see SampleBuffer
093 * @see Sampling
094 *
095 * @param <T> the operation type
096 *
097 * @author <a href="mailto:franz.wilhelmstoetter@gmail.com">Franz Wilhelmstötter</a>
098 * @version 6.0
099 * @since 5.0
100 */
101public final class Regression<T>
102        implements Problem<Tree<Op<T>, ?>, ProgramGene<T>, Double>
103{
104
105        private final Codec<Tree<Op<T>, ?>, ProgramGene<T>> _codec;
106        private final Error<T> _error;
107        private final Sampling<T> _sampling;
108
109
110        /**
111         * Create a new <em>symbolic</em> regression problem with the given data.
112         *
113         * @param codec the codec used for the problem
114         * @param error the error function
115         * @param sampling the sample values used for finding a regression.
116         */
117        private Regression(
118                final Codec<Tree<Op<T>, ?>, ProgramGene<T>> codec,
119                final Error<T> error,
120                final Sampling<T> sampling
121        ) {
122                _codec = requireNonNull(codec);
123                _error = requireNonNull(error);
124                _sampling = requireNonNull(sampling);
125        }
126
127        @Override
128        public Function<Tree<Op<T>, ?>, Double> fitness() {
129                return this::error;
130        }
131
132        @Override
133        public Codec<Tree<Op<T>, ?>, ProgramGene<T>> codec() {
134                return _codec;
135        }
136
137        /**
138         * Calculates the actual error for the given {@code program}.
139         *
140         * @param program the program to calculate the error value for
141         * @return the overall error value of the program
142         */
143        public double error(final Tree<? extends Op<T>, ?> program) {
144                final Result<T> result = _sampling.eval(program);
145                return result != null
146                        ? _error.apply(program, result.calculated(), result.expected())
147                        : Double.MAX_VALUE;
148        }
149
150        /* *************************************************************************
151         * Factory methods.
152         * ************************************************************************/
153
154        /**
155         * Create a new regression problem instance with the given parameters.
156         *
157         * @see #codecOf(ISeq, ISeq, int)
158         * @see #codecOf(ISeq, ISeq, int, Predicate)
159         *
160         * @param <T> the operation type
161         * @param codec the problem codec to use
162         * @param error the error function
163         * @param sampling the sampling function
164         * @return a new regression problem instance
165         * @throws NullPointerException if on of the arguments is {@code null}
166         */
167        public static <T> Regression<T> of(
168                final Codec<Tree<Op<T>, ?>, ProgramGene<T>> codec,
169                final Error<T> error,
170                final Sampling<T> sampling
171        ) {
172                return new Regression<>(codec, error, sampling);
173        }
174
175        /**
176         * Create a new regression problem instance with the given parameters.
177         *
178         * @see #codecOf(ISeq, ISeq, int)
179         * @see #codecOf(ISeq, ISeq, int, Predicate)
180         *
181         * @param <T> the operation type
182         * @param codec the problem codec to use
183         * @param error the error function
184         * @param samples the sample points used for regression analysis
185         * @return a new regression problem instance
186         * @throws IllegalArgumentException if the given {@code samples} is empty
187         * @throws NullPointerException if one of the arguments is {@code null}
188         */
189        public static <T> Regression<T> of(
190                final Codec<Tree<Op<T>, ?>, ProgramGene<T>> codec,
191                final Error<T> error,
192                final Iterable<? extends Sample<T>> samples
193        ) {
194                if (!samples.iterator().hasNext()) {
195                        throw new IllegalArgumentException("Sample list must not be empty.");
196                }
197
198                final List<Sample<T>> s = new ArrayList<>();
199                samples.forEach(s::add);
200
201                return new Regression<>(codec, error, new SampleList<>(s));
202        }
203
204        /**
205         * Create a new regression problem instance with the given parameters.
206         *
207         * @see #codecOf(ISeq, ISeq, int)
208         * @see #codecOf(ISeq, ISeq, int, Predicate)
209         *
210         * @param <T> the operation type
211         * @param codec the problem codec to use
212         * @param error the error function
213         * @param samples the sample points used for regression analysis
214         * @return a new regression problem instance
215         * @throws IllegalArgumentException if the given {@code samples} is empty
216         * @throws NullPointerException if one of the arguments is {@code null}
217         */
218        @SafeVarargs
219        public static <T> Regression<T> of(
220                final Codec<Tree<Op<T>, ?>, ProgramGene<T>> codec,
221                final Error<T> error,
222                final Sample<T>... samples
223        ) {
224                return of(codec, error, List.of(samples));
225        }
226
227
228        /* *************************************************************************
229         * Codec factory methods.
230         * ************************************************************************/
231
232        /**
233         * Create a new <em>codec</em>, usable for <em>symbolic regression</em>
234         * problems, with the given parameters.
235         *
236         * @param <T> the operation type
237         * @param operations the operations used for the symbolic regression
238         * @param terminals the terminal operations of the program tree
239         * @param depth the maximal tree depth (height) of newly created program
240         *        trees
241         * @param validator the chromosome validator. A typical validator would
242         *        check the size of the tree and if the tree is too large, mark it
243         *        at <em>invalid</em>. The <em>validator</em> may be {@code null}.
244         * @return a new codec, usable for symbolic regression
245         * @throws IllegalArgumentException if the tree {@code depth} is not in the
246         *         valid range of {@code [0, 30)}
247         * @throws NullPointerException if the {@code operations} or {@code terminals}
248         *         are {@code null}
249         */
250        public static <T> Codec<Tree<Op<T>, ?>, ProgramGene<T>>
251        codecOf(
252                final ISeq<Op<T>> operations,
253                final ISeq<Op<T>> terminals,
254                final int depth,
255                final Predicate<? super ProgramChromosome<T>> validator
256        ) {
257                if (depth >= 30 || depth < 0) {
258                        throw new IllegalArgumentException(format(
259                                "Tree depth out of range [0, 30): %d", depth
260                        ));
261                }
262
263                return Codec.of(
264                        Genotype.of(
265                                ProgramChromosome.of(
266                                        depth,
267                                        validator,
268                                        operations,
269                                        terminals
270                                )
271                        ),
272                        Genotype::gene
273                );
274        }
275
276        /**
277         * Create a new <em>codec</em>, usable for <em>symbolic regression</em>
278         * problems, with the given parameters.
279         *
280         * @param <T> the operation type
281         * @param operations the operations used for the symbolic regression
282         * @param terminals the terminal operations of the program tree
283         * @param depth the maximal tree depth (height) of newly created program
284         *        trees
285         * @return a new codec, usable for symbolic regression
286         * @throws IllegalArgumentException if the tree {@code depth} is not in the
287         *         valid range of {@code [0, 30)}
288         * @throws NullPointerException if the {@code operations} or {@code terminals}
289         *         are {@code null}
290         */
291        public static <T> Codec<Tree<Op<T>, ?>, ProgramGene<T>>
292        codecOf(
293                final ISeq<Op<T>> operations,
294                final ISeq<Op<T>> terminals,
295                final int depth
296        ) {
297                // Average arity of tree nodes.
298                final double k = operations.stream()
299                        .collect(Collectors.averagingDouble(Op::arity));
300
301                // The average node count between treeDepth and treeDepth + 1.
302                // 2^(k + 1) - 1 + 2^(k + 2) - 1)/2 == 3*2^k - 1
303                final int max = (int)(3*pow(k, depth) - 1);
304
305                return codecOf(
306                        operations,
307                        terminals,
308                        depth,
309                        ch -> ch.root().size() <= max
310                );
311        }
312
313}