001/*
002 * Java Genetic Algorithm Library (jenetics-8.1.0).
003 * Copyright (c) 2007-2024 Franz Wilhelmstötter
004 *
005 * Licensed under the Apache License, Version 2.0 (the "License");
006 * you may not use this file except in compliance with the License.
007 * You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 *
017 * Author:
018 *    Franz Wilhelmstötter (franz.wilhelmstoetter@gmail.com)
019 */
020package io.jenetics.prog.regression;
021
022import static java.lang.Math.pow;
023import static java.lang.String.format;
024import static java.util.Objects.requireNonNull;
025
026import java.util.ArrayList;
027import java.util.List;
028import java.util.function.Function;
029import java.util.function.Predicate;
030import java.util.stream.Collectors;
031
032import io.jenetics.Genotype;
033import io.jenetics.engine.Codec;
034import io.jenetics.engine.Problem;
035import io.jenetics.util.ISeq;
036
037import io.jenetics.ext.util.Tree;
038
039import io.jenetics.prog.ProgramChromosome;
040import io.jenetics.prog.ProgramGene;
041import io.jenetics.prog.op.Op;
042import io.jenetics.prog.regression.Sampling.Result;
043
044/**
045 * This class implements a <em>symbolic</em> regression problem. The example
046 * below shows a typical usage of the {@code Regression} class.
047 *
048 * {@snippet lang="java":
049 * public class SymbolicRegression {
050 *     private static final ISeq<Op<Double>> OPERATIONS =
051 *         ISeq.of(MathOp.ADD, MathOp.SUB, MathOp.MUL);
052 *
053 *     private static final ISeq<Op<Double>> TERMINALS = ISeq.of(
054 *         Var.of("x", 0),
055 *         EphemeralConst.of(() -> (double)RandomRegistry.random().nextInt(10))
056 *     );
057 *
058 *     private static final Regression<Double> REGRESSION = Regression.of(
059 *         Regression.codecOf(OPERATIONS, TERMINALS, 5),
060 *         Error.of(LossFunction::mse),
061 *         Sample.ofDouble(-1.0, -8.0000),
062 *         // ...
063 *         Sample.ofDouble(0.9, 1.3860),
064 *         Sample.ofDouble(1.0, 2.0000)
065 *     );
066 *
067 *     public static void main(final String[] args) {
068 *         final Engine<ProgramGene<Double>, Double> engine = Engine
069 *             .builder(REGRESSION)
070 *             .minimizing()
071 *             .alterers(
072 *                 new SingleNodeCrossover<>(0.1),
073 *                 new Mutator<>())
074 *             .build();
075 *
076 *         final EvolutionResult<ProgramGene<Double>, Double> result = engine.stream()
077 *             .limit(Limits.byFitnessThreshold(0.01))
078 *             .collect(EvolutionResult.toBestEvolutionResult());
079 *
080 *         final ProgramGene<Double> program = result.bestPhenotype()
081 *             .genotype()
082 *             .gene();
083 *
084 *         final TreeNode<Op<Double>> tree = program.toTreeNode();
085 *         MathExpr.rewrite(tree); // Simplify result program.
086 *         System.out.println("Generations: " + result.totalGenerations());
087 *         System.out.println("Function:    " + new MathExpr(tree));
088 *         System.out.println("Error:       " + REGRESSION.error(tree));
089 *     }
090 * }
091 * }
092 *
093 * @see SampleBuffer
094 * @see Sampling
095 *
096 * @param <T> the operation type
097 *
098 * @author <a href="mailto:franz.wilhelmstoetter@gmail.com">Franz Wilhelmstötter</a>
099 * @version 6.0
100 * @since 5.0
101 */
102public final class Regression<T>
103        implements Problem<Tree<Op<T>, ?>, ProgramGene<T>, Double>
104{
105
106        private final Codec<Tree<Op<T>, ?>, ProgramGene<T>> _codec;
107        private final Error<T> _error;
108        private final Sampling<T> _sampling;
109
110
111        /**
112         * Create a new <em>symbolic</em> regression problem with the given data.
113         *
114         * @param codec the codec used for the problem
115         * @param error the error function
116         * @param sampling the sample values used for finding a regression.
117         */
118        private Regression(
119                final Codec<Tree<Op<T>, ?>, ProgramGene<T>> codec,
120                final Error<T> error,
121                final Sampling<T> sampling
122        ) {
123                _codec = requireNonNull(codec);
124                _error = requireNonNull(error);
125                _sampling = requireNonNull(sampling);
126        }
127
128        @Override
129        public Function<Tree<Op<T>, ?>, Double> fitness() {
130                return this::error;
131        }
132
133        @Override
134        public Codec<Tree<Op<T>, ?>, ProgramGene<T>> codec() {
135                return _codec;
136        }
137
138        /**
139         * Calculates the actual error for the given {@code program}.
140         *
141         * @param program the program to calculate the error value for
142         * @return the overall error value of the program
143         */
144        public double error(final Tree<? extends Op<T>, ?> program) {
145                final Result<T> result = _sampling.eval(program);
146                return result != null
147                        ? _error.apply(program, result.calculated(), result.expected())
148                        : Double.MAX_VALUE;
149        }
150
151        /* *************************************************************************
152         * Factory methods.
153         * ************************************************************************/
154
155        /**
156         * Create a new regression problem instance with the given parameters.
157         *
158         * @see #codecOf(ISeq, ISeq, int)
159         * @see #codecOf(ISeq, ISeq, int, Predicate)
160         *
161         * @param <T> the operation type
162         * @param codec the problem codec to use
163         * @param error the error function
164         * @param sampling the sampling function
165         * @return a new regression problem instance
166         * @throws NullPointerException if on of the arguments is {@code null}
167         */
168        public static <T> Regression<T> of(
169                final Codec<Tree<Op<T>, ?>, ProgramGene<T>> codec,
170                final Error<T> error,
171                final Sampling<T> sampling
172        ) {
173                return new Regression<>(codec, error, sampling);
174        }
175
176        /**
177         * Create a new regression problem instance with the given parameters.
178         *
179         * @see #codecOf(ISeq, ISeq, int)
180         * @see #codecOf(ISeq, ISeq, int, Predicate)
181         *
182         * @param <T> the operation type
183         * @param codec the problem codec to use
184         * @param error the error function
185         * @param samples the sample points used for regression analysis
186         * @return a new regression problem instance
187         * @throws IllegalArgumentException if the given {@code samples} is empty
188         * @throws NullPointerException if one of the arguments is {@code null}
189         */
190        public static <T> Regression<T> of(
191                final Codec<Tree<Op<T>, ?>, ProgramGene<T>> codec,
192                final Error<T> error,
193                final Iterable<? extends Sample<T>> samples
194        ) {
195                if (!samples.iterator().hasNext()) {
196                        throw new IllegalArgumentException("Sample list must not be empty.");
197                }
198
199                final List<Sample<T>> s = new ArrayList<>();
200                samples.forEach(s::add);
201
202                return new Regression<>(codec, error, new SampleList<>(s));
203        }
204
205        /**
206         * Create a new regression problem instance with the given parameters.
207         *
208         * @see #codecOf(ISeq, ISeq, int)
209         * @see #codecOf(ISeq, ISeq, int, Predicate)
210         *
211         * @param <T> the operation type
212         * @param codec the problem codec to use
213         * @param error the error function
214         * @param samples the sample points used for regression analysis
215         * @return a new regression problem instance
216         * @throws IllegalArgumentException if the given {@code samples} is empty
217         * @throws NullPointerException if one of the arguments is {@code null}
218         */
219        @SafeVarargs
220        public static <T> Regression<T> of(
221                final Codec<Tree<Op<T>, ?>, ProgramGene<T>> codec,
222                final Error<T> error,
223                final Sample<T>... samples
224        ) {
225                return of(codec, error, List.of(samples));
226        }
227
228
229        /* *************************************************************************
230         * Codec factory methods.
231         * ************************************************************************/
232
233        /**
234         * Create a new <em>codec</em>, usable for <em>symbolic regression</em>
235         * problems, with the given parameters.
236         *
237         * @param <T> the operation type
238         * @param operations the operations used for the symbolic regression
239         * @param terminals the terminal operations of the program tree
240         * @param depth the maximal tree depth (height) of newly created program
241         *        trees
242         * @param validator the chromosome validator. A typical validator would
243         *        check the size of the tree and if the tree is too large, mark it
244         *        at <em>invalid</em>. The <em>validator</em> may be {@code null}.
245         * @return a new codec, usable for symbolic regression
246         * @throws IllegalArgumentException if the tree {@code depth} is not in the
247         *         valid range of {@code [0, 30)}
248         * @throws NullPointerException if the {@code operations} or {@code terminals}
249         *         are {@code null}
250         */
251        public static <T> Codec<Tree<Op<T>, ?>, ProgramGene<T>>
252        codecOf(
253                final ISeq<Op<T>> operations,
254                final ISeq<Op<T>> terminals,
255                final int depth,
256                final Predicate<? super ProgramChromosome<T>> validator
257        ) {
258                if (depth >= 30 || depth < 0) {
259                        throw new IllegalArgumentException(format(
260                                "Tree depth out of range [0, 30): %d", depth
261                        ));
262                }
263
264                return Codec.of(
265                        Genotype.of(
266                                ProgramChromosome.of(
267                                        depth,
268                                        validator,
269                                        operations,
270                                        terminals
271                                )
272                        ),
273                        Genotype::gene
274                );
275        }
276
277        /**
278         * Create a new <em>codec</em>, usable for <em>symbolic regression</em>
279         * problems, with the given parameters.
280         *
281         * @param <T> the operation type
282         * @param operations the operations used for the symbolic regression
283         * @param terminals the terminal operations of the program tree
284         * @param depth the maximal tree depth (height) of newly created program
285         *        trees
286         * @return a new codec, usable for symbolic regression
287         * @throws IllegalArgumentException if the tree {@code depth} is not in the
288         *         valid range of {@code [0, 30)}
289         * @throws NullPointerException if the {@code operations} or {@code terminals}
290         *         are {@code null}
291         */
292        public static <T> Codec<Tree<Op<T>, ?>, ProgramGene<T>>
293        codecOf(
294                final ISeq<Op<T>> operations,
295                final ISeq<Op<T>> terminals,
296                final int depth
297        ) {
298                // Average arity of tree nodes.
299                final double k = operations.stream()
300                        .collect(Collectors.averagingDouble(Op::arity));
301
302                // The average node count between treeDepth and treeDepth + 1.
303                // 2^(k + 1) - 1 + 2^(k + 2) - 1)/2 == 3*2^k - 1
304                final int max = (int)(3*pow(k, depth) - 1);
305
306                return codecOf(
307                        operations,
308                        terminals,
309                        depth,
310                        ch -> ch.root().size() <= max
311                );
312        }
313
314}