001/* 002 * Java Genetic Algorithm Library (jenetics-8.3.0). 003 * Copyright (c) 2007-2025 Franz Wilhelmstötter 004 * 005 * Licensed under the Apache License, Version 2.0 (the "License"); 006 * you may not use this file except in compliance with the License. 007 * You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 * 017 * Author: 018 * Franz Wilhelmstötter (franz.wilhelmstoetter@gmail.com) 019 */ 020package io.jenetics.ext.util; 021 022import static java.util.Objects.requireNonNull; 023 024import java.io.IOException; 025import java.io.UncheckedIOException; 026import java.nio.CharBuffer; 027import java.util.Arrays; 028import java.util.List; 029import java.util.Objects; 030import java.util.function.Function; 031import java.util.function.Supplier; 032import java.util.stream.Collector; 033import java.util.stream.Collectors; 034import java.util.stream.Stream; 035 036import io.jenetics.internal.util.Lifecycle.IOValue; 037 038/** 039 * This class contains helper classes, which are the building blocks for handling 040 * CSV files. 041 * <ul> 042 * <li>{@link LineReader}: This class allows you to read the lines of a 043 * CSV file. 
The result will be a {@link Stream} of CSV lines and are 044 * not split.</li> 045 * <li>{@link LineSplitter}: This class is responsible for splitting one 046 * CSV line into column values.</li> 047 * <li>{@link ColumnIndexes}: Allows defining the projection/embedding of 048 * the split/joined column values.</li> 049 * <li>{@link ColumnJoiner}: Joining a column array into a CSV line, which 050 * can be joined into a whole CSV string.</li> 051 * </ul> 052 * <p> 053 * Additionally, this class contains a set of helper methods for CSV handling 054 * using default configurations. 055 * <p> 056 * <b>Reading and splitting CSV lines</b> 057 * {@snippet class="Snippets" region="readRows"} 058 * <p> 059 * <b>Joining columns and creating CSV string</b> 060 * {@snippet class="Snippets" region="CsvSupportSnippets.collect"} 061 * <p> 062 * <b>Parsing CSV string</b> 063 * {@snippet class="Snippets" region="parseCsv"} 064 * <p> 065 * <b>Parsing double values, given as CSV string</b> 066 * <p> 067 * Another example is to parse double values, which are given as CSV string and 068 * use this data for running a regression analysis. 069 * {@snippet class="Snippets" region="DoublesParsingSnippets.parseDoubles"} 070 * 071 * @see <a href="https://tools.ietf.org/html/rfc4180">RFC-4180</a> 072 * 073 * @author <a href="mailto:franz.wilhelmstoetter@gmail.com">Franz Wilhelmstötter</a> 074 * @version 8.2 075 * @since 8.1 076 */ 077public final class CsvSupport { 078 079 /** 080 * Holds the CSV column <em>separator</em> character. 081 * 082 * @param value the separator character 083 * 084 * @version 8.1 085 * @since 8.1 086 */ 087 public record Separator(char value) { 088 089 /** 090 * The default separator character, '{@code ,}'. 091 */ 092 public static final Separator DEFAULT = new Separator(','); 093 094 /** 095 * Creates a new Separator char object. 
096 * 097 * @param value the separator character 098 * @throws IllegalArgumentException if the given separator character is 099 * a line break character 100 */ 101 public Separator { 102 if (isLineBreak(value)) { 103 throw new IllegalArgumentException( 104 "Given separator char is a line break character." 105 ); 106 } 107 } 108 } 109 110 /** 111 * Holds the CSV column <em>quote</em> character. The following excerpt from 112 * <a href="https://tools.ietf.org/html/rfc4180">RFC-4180</a> defines when 113 * a quote character has to be used. 114 * <pre> 115 * 5. Each field may or may not be enclosed in double quotes (however 116 * some programs, such as Microsoft Excel, do not use double quotes 117 * at all). If fields are not enclosed with double quotes, then 118 * double quotes may not appear inside the fields. For example: 119 * 120 * "aaa","bbb","ccc" CRLF 121 * zzz,yyy,xxx 122 * 123 * 6. Fields containing line breaks (CRLF), double quotes, and commas 124 * should be enclosed in double-quotes. For example: 125 * 126 * "aaa","b CRLF 127 * bb","ccc" CRLF 128 * zzz,yyy,xxx 129 * 130 * 7. If double-quotes are used to enclose fields, then a double-quote 131 * appearing inside a field must be escaped by preceding it with 132 * another double quote. For example: 133 * 134 * "aaa","b""bb","ccc" 135 * </pre> 136 * 137 * @param value the quote character 138 * 139 * @version 8.1 140 * @since 8.1 141 */ 142 public record Quote(char value) { 143 144 /** 145 * The default quote character, '{@code "}'. 146 */ 147 public static final Quote DEFAULT = new Quote('"'); 148 149 /** 150 * The zero '\0' character. 151 */ 152 public static final Quote ZERO = new Quote('\0'); 153 154 /** 155 * Creates a new Quote char object. 
156 * 157 * @param value the quote character 158 * @throws IllegalArgumentException if the given quote character is 159 * a line break character 160 */ 161 public Quote { 162 if (isLineBreak(value)) { 163 throw new IllegalArgumentException( 164 "Given quote char is a line break character." 165 ); 166 } 167 } 168 } 169 170 /** 171 * Holds the column indexes, which should be part of the split or join 172 * operation. When used in the {@link LineSplitter}, it lets you filter the 173 * split column and define its order. When used in the {@link ColumnJoiner}, 174 * it can be used to define the column index in the resulting CSV for a 175 * given row array. 176 * 177 * @apiNote 178 * The column indexes is <em>thread-safe</em> and can be shared between 179 * different threads. 180 * 181 * @see LineSplitter 182 * @see ColumnJoiner 183 * 184 * @param values the column indexes which are part of the split result 185 * 186 * @version 8.1 187 * @since 8.1 188 */ 189 public record ColumnIndexes(int... values) { 190 191 /** 192 * Indicating that <em>all</em> columns should be part of the split 193 * result. 194 */ 195 public static final ColumnIndexes ALL = new ColumnIndexes(); 196 197 /** 198 * Create a new column indexes object. 199 * 200 * @param values the column indexes 201 */ 202 public ColumnIndexes { 203 values = values.clone(); 204 } 205 206 @Override 207 public int[] values() { 208 return values.clone(); 209 } 210 211 @Override 212 public int hashCode() { 213 return Arrays.hashCode(values); 214 } 215 216 @Override 217 public boolean equals(final Object obj) { 218 return obj instanceof ColumnIndexes ci && 219 Arrays.equals(values, ci.values); 220 } 221 222 @Override 223 public String toString() { 224 return Arrays.toString(values); 225 } 226 } 227 228 /** 229 * The newline string used for writing the CSV file: {@code \r\n}. 
230 */ 231 public static final String EOL = "\r\n"; 232 233 234 private CsvSupport() { 235 } 236 237 private static boolean isLineBreak(final char c) { 238 return switch (c) { 239 case '\n', '\r' -> true; 240 default -> false; 241 }; 242 } 243 244 /** 245 * Splits the CSV file, given by the {@code reader}, into a {@link Stream} 246 * of CSV lines. The CSV is split at line breaks, as long as they are not 247 * part of a quoted column. For reading the CSV lines, the default quote 248 * character, {@link Quote#DEFAULT}, is used. 249 * 250 * @apiNote 251 * The returned stream must be closed by the caller, which also closes the 252 * CSV {@code reader}. 253 * 254 * @see #readAllLines(Readable) 255 * 256 * @param reader the CSV source reader. The reader is automatically closed 257 * when the returned line stream is closed. 258 * @return the stream of CSV lines 259 * @throws NullPointerException if the given {@code reader} is {@code null} 260 */ 261 public static Stream<String> lines(final Readable reader) { 262 return LineReader.DEFAULT.read(reader); 263 } 264 265 /** 266 * Splits the CSV file, given by the {@code reader}, into a {@code Stream} 267 * of CSV rows. The CSV is split at line breaks, as long as they are not 268 * part of a quoted column. For reading the CSV lines, the default quote 269 * character, {@link Quote#DEFAULT}, is used. Then each line is split into 270 * its columns using the default separator character. 271 * 272 * @apiNote 273 * The returned stream must be closed by the caller, which also closes the 274 * CSV {@code reader}. 275 * 276 * @see #readAllRows(Readable) 277 * 278 * @param reader the CSV source reader. The reader is automatically closed 279 * when the returned line stream is closed. 
280 * @return the stream of CSV rows 281 * @throws NullPointerException if the given {@code reader} is {@code null} 282 */ 283 public static Stream<String[]> rows(final Readable reader) { 284 final var splitter = new LineSplitter(); 285 return lines(reader).map(splitter::split); 286 } 287 288 /** 289 * Splits the CSV file, given by the {@code reader}, into a {@code List} 290 * of CSV lines. The CSV is split at line breaks, as long as they are not 291 * part of a quoted column. For reading the CSV lines, the default quote 292 * character, {@link Quote#DEFAULT}, is used. 293 * 294 * @see #lines(Readable) 295 * 296 * @param reader the reader stream to split into CSV lines 297 * @return the list of CSV lines 298 * @throws NullPointerException if the given {@code reader} is {@code null} 299 * @throws IOException if reading the CSV lines fails 300 */ 301 public static List<String> readAllLines(final Readable reader) 302 throws IOException 303 { 304 try (var lines = lines(reader)) { 305 return lines.toList(); 306 } catch (UncheckedIOException e) { 307 throw e.getCause(); 308 } 309 } 310 311 /** 312 * Splits the CSV file, given by the {@code reader}, into a {@code List} 313 * of CSV lines. The CSV is split at line breaks, as long as they are not 314 * part of a quoted column. For reading the CSV lines, the default quote 315 * character, {@link Quote#DEFAULT}, is used. Then each line is split into 316 * its columns using the default separator character. 
317 * 318 * @see #rows(Readable) 319 * 320 * @param reader the reader stream to split into CSV lines 321 * @return the list of CSV rows 322 * @throws NullPointerException if the given {@code reader} is {@code null} 323 * @throws IOException if reading the CSV lines fails 324 */ 325 public static List<String[]> readAllRows(final Readable reader) 326 throws IOException 327 { 328 try (var rows = rows(reader)) { 329 return rows.toList(); 330 } catch (UncheckedIOException e) { 331 throw e.getCause(); 332 } 333 } 334 335 /** 336 * Parses the given CSV string into a list of <em>records</em>. The records 337 * are created from a <em>row</em> ({@code String[]} array) by applying the 338 * given {@code mapper}. 339 * 340 * @param csv the CSV string to parse 341 * @param mapper the record mapper 342 * @return the parsed record list 343 * @param <T> the record type 344 */ 345 public static <T> List<T> parse( 346 final CharSequence csv, 347 final Function<? super String[], ? extends T> mapper 348 ) { 349 requireNonNull(csv); 350 requireNonNull(mapper); 351 352 try (var rows = rows(CharBuffer.wrap(csv))) { 353 return rows 354 .map(mapper) 355 .collect(Collectors.toUnmodifiableList()); 356 } 357 } 358 359 /** 360 * Parses the given CSV string into a list of rows. 361 * 362 * @param csv the CSV string to parse 363 * @return the parsed CSV rows 364 */ 365 public static List<String[]> parse(final CharSequence csv) { 366 return parse(csv, Function.identity()); 367 } 368 369 /** 370 * Parses the given CSV string into a list of {@code double[]} array rows. 
371 * 372 * @param csv the CSV string to parse 373 * @return the parsed double data 374 */ 375 public static List<double[]> parseDoubles(final CharSequence csv) { 376 return parse(csv, CsvSupport::toDoubles); 377 } 378 379 private static double[] toDoubles(final String[] values) { 380 final var result = new double[values.length]; 381 for (int i = 0; i < result.length; ++i) { 382 result[i] = Double.parseDouble(values[i].trim()); 383 } 384 return result; 385 } 386 387 /** 388 * Splits a given CSV {@code line} into columns. The default values for the 389 * separator and quote character are used ({@link Separator#DEFAULT}, 390 * {@link Quote#DEFAULT}) for splitting the line. 391 * 392 * @param line the CSV line to split 393 * @return the split CSV lines 394 * @throws NullPointerException if the given {@code line} is {@code null} 395 */ 396 public static String[] split(final CharSequence line) { 397 return new LineSplitter().split(line); 398 } 399 400 /** 401 * Joins the given CSV {@code columns} to one CSV line. The default values 402 * for the separator and quote character are used ({@link Separator#DEFAULT}, 403 * {@link Quote#DEFAULT}) for joining the columns. 404 * 405 * @see #join(Object[]) 406 * 407 * @param columns the CSV columns to join 408 * @return the CSV line, joined from the given {@code columns} 409 * @throws NullPointerException if the given {@code columns} is {@code null} 410 */ 411 public static String join(final Iterable<?> columns) { 412 return ColumnJoiner.DEFAULT.join(columns); 413 } 414 415 /** 416 * Joins the given CSV {@code columns} to one CSV line. The default values 417 * for the separator and quote character are used ({@link Separator#DEFAULT}, 418 * {@link Quote#DEFAULT}) for joining the columns. 
419 * 420 * @see #join(Iterable) 421 * 422 * @param columns the CSV columns to join 423 * @return the CSV line, joined from the given {@code columns} 424 * @throws NullPointerException if the given {@code columns} is {@code null} 425 */ 426 public static String join(final Object[] columns) { 427 return ColumnJoiner.DEFAULT.join(columns); 428 } 429 430 /** 431 * Joins the given CSV {@code columns} to one CSV line. The default values 432 * for the separator and quote character are used ({@link Separator#DEFAULT}, 433 * {@link Quote#DEFAULT}) for joining the columns. 434 * 435 * @see #join(Iterable) 436 * @see #join(Object[]) 437 * 438 * @param columns the CSV columns to join 439 * @return the CSV line, joined from the given {@code columns} 440 * @throws NullPointerException if the given {@code columns} is {@code null} 441 */ 442 public static String join(final String... columns) { 443 return ColumnJoiner.DEFAULT.join(columns); 444 } 445 446 /** 447 * Converts the given {@code record} into its components. 448 * 449 * @param record the record to convert 450 * @return the record components 451 */ 452 public static Object[] toComponents(final Record record) { 453 try { 454 final var components = record.getClass().getRecordComponents(); 455 final var elements = new Object[components.length]; 456 for (int i = 0; i < elements.length; ++i) { 457 elements[i] = components[i].getAccessor().invoke(record); 458 } 459 460 return elements; 461 } catch (ReflectiveOperationException e) { 462 throw new IllegalArgumentException(e); 463 } 464 } 465 466 /** 467 * Return a collector for joining a list of CSV rows into one CSV string. 468 * 469 * @return a collector for joining a list of CSV rows into one CSV string 470 */ 471 public static Collector<CharSequence, ?, String> toCsv() { 472 return toCsv(EOL); 473 } 474 475 /** 476 * Return a collector for joining a list of CSV rows into one CSV string. 477 * For the line breaks, the given {@code eol} sequence is used. 
478 * 479 * @param eol the end of line sequence used for line breaks 480 * @return a collector for joining a list of CSV rows into one CSV string 481 */ 482 public static Collector<CharSequence, ?, String> toCsv(String eol) { 483 if (eol.isEmpty()) { 484 throw new IllegalArgumentException("EOL must not be empty."); 485 } 486 for (int i = 0; i < eol.length(); ++i) { 487 if (!isLineBreak(eol.charAt(i))) { 488 throw new IllegalArgumentException( 489 "EOl contains non-linebreak char: '%s'.".formatted(eol) 490 ); 491 } 492 } 493 494 return Collectors.joining(eol, "", eol); 495 } 496 497 498 /* ************************************************************************* 499 * Base CSV classes. 500 * ************************************************************************/ 501 502 /** 503 * This class reads CSV files and splits it into lines. It takes a quote 504 * character as a parameter, which is necessary for not splitting on quoted 505 * line feeds. 506 * {@snippet lang="java": 507 * final var csv = """ 508 * 0.0,0.0000 509 * 0.1,0.0740 510 * 0.2,0.1120 511 * 0.3,0.1380 512 * 0.4,0.1760 513 * 0.5,0.2500 514 * 0.6,0.3840 515 * 0.7,0.6020 516 * 0.8,0.9280 517 * 0.9,1.3860 518 * 1.0,2.0000 519 * """; 520 * 521 * final var reader = new LineReader(new Quote('"')); 522 * try (Stream<String> lines = reader.read(CharBuffer.wrap(csv))) { 523 * lines.forEach(System.out::println); 524 * } 525 * } 526 * 527 * @apiNote 528 * This reader obeys <em>escaped</em> line breaks according 529 * <a href="https://tools.ietf.org/html/rfc4180">RFC-4180</a>. It is 530 * thread-safe and can be shared between different reading threads. 531 * 532 * @version 8.1 533 * @since 8.1 534 */ 535 public static final class LineReader { 536 537 private static final LineReader DEFAULT = new LineReader(Quote.DEFAULT); 538 539 private final Quote quote; 540 541 /** 542 * Create a new line-reader with the given {@code quote} character, 543 * which is used in the CSV file which is read. 
544 * 545 * @param quote the quoting character 546 * @throws NullPointerException if the {@code quote} character is 547 * {@code null} 548 */ 549 public LineReader(final Quote quote) { 550 this.quote = requireNonNull(quote); 551 } 552 553 /** 554 * Create a new line reader with default quote character {@code '"'} 555 * ({@link Quote#DEFAULT}). 556 */ 557 public LineReader() { 558 this(Quote.DEFAULT); 559 } 560 561 /** 562 * Reads all CSV lines from the given {@code reader}. 563 * 564 * @apiNote 565 * This method must be used within a try-with-resources statement or 566 * similar control structure to ensure that the stream's open file is 567 * closed promptly after the stream's operations have completed. 568 * 569 * @param readable the readable from which to read the CSV content 570 * @return the CSV lines from the file as a {@code Stream} 571 */ 572 public Stream<String> read(final Readable readable) { 573 requireNonNull(readable); 574 575 final IOValue<Stream<String>> result = new IOValue<>(resources -> { 576 final Readable rdr = resources.use( 577 readable, 578 resource -> { 579 if (resource instanceof AutoCloseable closeable) { 580 try { 581 closeable.close(); 582 } catch (IOException | RuntimeException | Error e) { 583 throw e; 584 } catch (Exception e) { 585 throw new IOException(e); 586 } 587 } 588 } 589 ); 590 591 final var source = CharCursor.of(rdr); 592 final var line = new CharAppender(); 593 594 final Supplier<String> nextLine = () -> { 595 line.reset(); 596 try { 597 return nextLine(source, line) ? 
line.toString() : null; 598 } catch (IOException e) { 599 throw new UncheckedIOException(e); 600 } 601 }; 602 603 return Stream.generate(nextLine) 604 .takeWhile(Objects::nonNull); 605 }); 606 607 return result.get().onClose(() -> 608 result.release(UncheckedIOException::new) 609 ); 610 } 611 612 private boolean nextLine(final CharCursor chars, final CharAppender line) 613 throws IOException 614 { 615 boolean quoted = false; 616 boolean escaped = false; 617 boolean eol = false; 618 619 int next = -2; 620 int i = 0; 621 622 while (next >= 0 || (i = chars.next()) != -1) { 623 final char current = next != -2 ? (char)next : (char)i; 624 next = -2; 625 626 if (isLineBreak(current)) { 627 if (quoted) { 628 line.append(current); 629 } else { 630 eol = true; 631 } 632 } else if (current == quote.value) { 633 if (quoted) { 634 if (!escaped && (next = chars.next()) == quote.value) { 635 escaped = true; 636 } else { 637 if (escaped) { 638 escaped = false; 639 } else { 640 quoted = false; 641 } 642 } 643 } else { 644 quoted = true; 645 } 646 line.append(current); 647 } else { 648 line.append(current); 649 } 650 651 if (eol) { 652 eol = false; 653 if (line.nonEmpty()) { 654 return true; 655 } 656 } 657 } 658 659 if (quoted) { 660 throw new IllegalArgumentException( 661 "Unbalanced quote character: '%s'." 662 .formatted(toString(line)) 663 ); 664 } 665 return line.nonEmpty(); 666 } 667 668 private static String toString(final Object value) { 669 final var line = value.toString(); 670 return line.length() > 15 ? line.substring(0, 15) + "..." : line; 671 } 672 } 673 674 /** 675 * Splitting a CSV line into columns (records). 676 * <h2>Examples</h2> 677 * <b>Simple usage</b> 678 * {@snippet class="Snippets" region="LineSplitterSnippets.simpleSplit"} 679 * 680 * <b>Projecting and re-ordering columns</b> 681 * {@snippet class="Snippets" region="LineSplitterSnippets.projectingSplit"} 682 * 683 * @implNote 684 * The split {@code String[]} array will never contain {@code null} values. 
685 * Empty columns will be returned as empty strings. 686 * 687 * @apiNote 688 * A line splitter ist <b>not</b> thread-safe and can't be shared between 689 * different threads. 690 * 691 * @version 8.1 692 * @since 8.1 693 */ 694 public static final class LineSplitter { 695 private final Separator separator; 696 private final Quote quote; 697 698 private final ColumnList columns; 699 private final CharAppender column = new CharAppender(); 700 701 /** 702 * Create a new line splitter with the given parameters. 703 * 704 * @param separator the separator character used by the CSV line to split 705 * @param quote the quote character used by the CSV line to split 706 * @param projection the column indexes which should be part of the split 707 * result 708 * @throws NullPointerException if one of the parameters is {@code null} 709 */ 710 public LineSplitter( 711 final Separator separator, 712 final Quote quote, 713 final ColumnIndexes projection 714 ) { 715 if (separator.value == quote.value) { 716 throw new IllegalArgumentException( 717 "Separator and quote char must be different: %s == %s." 718 .formatted(separator.value, quote.value) 719 ); 720 } 721 722 this.separator = separator; 723 this.quote = quote; 724 this.columns = new ColumnList(projection); 725 } 726 727 /** 728 * Create a new line splitter with the given parameters. 729 * 730 * @param separator the separator character used by the CSV line to split 731 * @param quote the quote character used by the CSV line to split 732 * @throws NullPointerException if one of the parameters is {@code null} 733 */ 734 public LineSplitter(final Separator separator, final Quote quote) { 735 this(separator, quote, ColumnIndexes.ALL); 736 } 737 738 /** 739 * Create a new line splitter with the given parameters. The default 740 * quote character, {@link Quote#DEFAULT}, will be used by the created 741 * splitter. 
742 * 743 * @param separator the separator character used by the CSV line to split 744 * @throws NullPointerException if one of the parameters is {@code null} 745 */ 746 public LineSplitter(final Separator separator) { 747 this(separator, Quote.DEFAULT, ColumnIndexes.ALL); 748 } 749 750 /** 751 * Create a new line splitter with the given parameters. The default 752 * separator character, {@link Separator#DEFAULT}, will be used by the 753 * created splitter. 754 * 755 * @param quote the quote character used by the CSV line to split 756 * @throws NullPointerException if one of the parameters is {@code null} 757 */ 758 public LineSplitter(final Quote quote) { 759 this(Separator.DEFAULT, quote, ColumnIndexes.ALL); 760 } 761 762 /** 763 * Create a new line splitter with the given parameters. Only the defined 764 * columns will be part of the split result and the default separator 765 * character, {@link Separator#DEFAULT}, and default quote character, 766 * {@link Quote#DEFAULT}, is used by the created splitter. 767 * 768 * @param projection the column indexes which should be part of the split 769 * result 770 * @throws NullPointerException if one of the parameters is {@code null} 771 */ 772 public LineSplitter(final ColumnIndexes projection) { 773 this(Separator.DEFAULT, Quote.DEFAULT, projection); 774 } 775 776 /** 777 * Create a new line splitter with default values. 778 */ 779 public LineSplitter() { 780 this(Separator.DEFAULT, Quote.DEFAULT, ColumnIndexes.ALL); 781 } 782 783 /** 784 * Splitting the given CSV {@code line} into its columns. 785 * 786 * @implNote 787 * The split {@code String[]} array will never contain {@code null} values. 788 * Empty columns will be returned as empty strings. 
789 * 790 * @param line the CSV line to split 791 * @return the split CSV columns 792 * @throws NullPointerException if the CSV {@code line} is {@code null} 793 */ 794 public String[] split(final CharSequence line) { 795 columns.clear(); 796 column.reset(); 797 798 boolean quoted = false; 799 boolean escaped = false; 800 boolean full = false; 801 802 int quoteIndex = 0; 803 804 for (int i = 0, n = line.length(); i < n && !full; ++i) { 805 final int previous = i > 0 ? line.charAt(i - 1) : -1; 806 final char current = line.charAt(i); 807 final int next = i + 1 < line.length() ? line.charAt(i + 1) : -1; 808 809 if (current == quote.value) { 810 if (quoted) { 811 if (!escaped && quote.value == next) { 812 escaped = true; 813 } else { 814 if (escaped) { 815 column.append(quote.value); 816 escaped = false; 817 } else { 818 if (next != -1 && separator.value != next) { 819 throw new IllegalArgumentException(""" 820 Only separator character, '%s', allowed \ 821 after quote, but found '%c': 822 %s 823 """.formatted( 824 separator.value, 825 next, 826 toErrorDesc(line, i + 1) 827 ) 828 ); 829 } 830 831 add(column); 832 full = columns.isFull(); 833 quoted = false; 834 } 835 } 836 } else { 837 if (previous != -1 && separator.value != previous) { 838 throw new IllegalArgumentException(""" 839 Only separator character, '%s', allowed before \ 840 quote, but found '%c': 841 %s 842 """.formatted( 843 separator.value, 844 previous, 845 toErrorDesc(line, Math.max(i - 1, 0)) 846 ) 847 ); 848 } 849 850 quoted = true; 851 quoteIndex = i; 852 } 853 } else if (current == separator.value) { 854 if (quoted) { 855 column.append(current); 856 } else if (separator.value == previous || previous == -1) { 857 add(column); 858 full = columns.isFull(); 859 } 860 } else { 861 // Read till the next token separator. 
862 int j = i; 863 char c; 864 while (j < line.length() && !isTokenSeparator(c = line.charAt(j))) { 865 column.append(c); 866 ++j; 867 } 868 if (j != i - 1) { 869 i = j - 1; 870 } 871 872 if (!quoted) { 873 add(column); 874 full = columns.isFull(); 875 } 876 } 877 } 878 879 if (quoted) { 880 throw new IllegalArgumentException(""" 881 Unbalanced quote character. 882 %s 883 """.formatted(toErrorDesc(line, quoteIndex)) 884 ); 885 } 886 if (line.isEmpty() || 887 separator.value == line.charAt(line.length() - 1)) 888 { 889 add(column); 890 } 891 892 return columns.toArray(); 893 } 894 895 private void add(final CharAppender column) { 896 columns.add(column.toString()); 897 column.reset(); 898 } 899 900 private boolean isTokenSeparator(final char c) { 901 return c == separator.value || c == quote.value; 902 } 903 904 private static String toErrorDesc(final CharSequence line, final int pos) { 905 return """ 906 %s 907 %s 908 """.formatted( 909 line.toString().stripTrailing(), 910 " ".repeat(pos) + "^" 911 ); 912 } 913 } 914 915 916 /** 917 * Column collection, which is backed up by a string list. 918 */ 919 static final class ColumnList { 920 private final StringList columns = new StringList(); 921 private final ColumnIndexes projection; 922 923 private int index = 0; 924 private int count = 0; 925 926 ColumnList(final ColumnIndexes projection) { 927 this.projection = requireNonNull(projection); 928 } 929 930 /** 931 * Appends a {@code column} to the column collection. 
932 * 933 * @param column the column to add 934 */ 935 void add(String column) { 936 if (!isFull()) { 937 count += set(column, index++); 938 } 939 } 940 941 private int set(String element, int column) { 942 int updated = 0; 943 944 if (projection.values.length == 0) { 945 columns.add(element); 946 ++updated; 947 } else { 948 int pos = -1; 949 while ((pos = indexOf(projection.values, pos + 1, column)) != -1) { 950 for (int i = columns.size(); i <= pos; ++i) { 951 columns.add(null); 952 } 953 columns.set(pos, element); 954 ++updated; 955 } 956 } 957 958 return updated; 959 } 960 961 private static int indexOf(int[] array, int start, int value) { 962 for (int i = start; i < array.length; ++i) { 963 if (array[i] == value) { 964 return i; 965 } 966 } 967 968 return -1; 969 } 970 971 /** 972 * Checks whether another column can be added. 973 * 974 * @return {@code true} if another column can be added to this 975 * collection, {@code false} otherwise 976 */ 977 boolean isFull() { 978 return 979 projection.values.length > 0 && 980 projection.values.length <= count; 981 } 982 983 /** 984 * Removes all columns. 985 */ 986 public void clear() { 987 columns.clear(); 988 index = 0; 989 count = 0; 990 } 991 992 String[] toArray() { 993 for (int i = columns.size(); i < projection.values.length; ++i) { 994 columns.add(null); 995 } 996 return columns.toArray(); 997 } 998 999 } 1000 1001 /** 1002 * This class joins an array of columns into one CSV line. 1003 * 1004 * <h2>Examples</h2> 1005 * <b>Simple usage</b> 1006 * {@snippet class="Snippets" region="ColumnJoinerSnippets.simpleJoin"} 1007 * 1008 * <b>Embedding and re-ordering data</b> 1009 * {@snippet class="Snippets" region="ColumnJoinerSnippets.embedToCsv"} 1010 * 1011 * @apiNote 1012 * The column joiner is <em>thread-safe</em> and can be shared between 1013 * different threads. 
*
     * @version 8.1
     * @since 8.1
     */
    public static final class ColumnJoiner {

        /**
         * Shared joiner instance, configured with {@link Separator#DEFAULT},
         * {@link Quote#DEFAULT} and {@link ColumnIndexes#ALL}.
         */
        public static final ColumnJoiner DEFAULT = new ColumnJoiner(
            Separator.DEFAULT,
            Quote.DEFAULT,
            ColumnIndexes.ALL
        );

        /**
         * Bundles the configuration of a joiner together with the quoting
         * logic that depends on it.
         *
         * @param separator the column separator char
         * @param quote the quote char
         * @param embedding the target indexes of the joined columns. If the
         *        array is empty, the columns are joined as they are given.
         */
        private record Param(char separator, char quote, int... embedding) {

            /**
             * Converts one column value into its CSV representation.
             * {@code null} becomes the empty string. Every quote character
             * in the value is doubled, and the value is enclosed in quote
             * characters if it contained a quote, a separator or a line
             * break character.
             *
             * @param value the column value, may be {@code null}
             * @return the CSV text of the given value
             */
            private String escape(Object value) {
                if (value == null) {
                    return "";
                }

                final String q = String.valueOf(quote);
                final String raw = value.toString();
                final String doubled = raw.replace(q, q + q);

                // A changed length means that at least one quote char has
                // been doubled, which forces the value to be quoted as well.
                final boolean quoted =
                    doubled.length() != raw.length() || mustEscape(doubled);

                return quoted ? q + doubled + q : raw;
            }

            /**
             * Tests whether the given value contains the separator or a line
             * break character.
             *
             * @param value the character sequence to check
             * @return {@code true} if the value must be enclosed in quotes,
             *         {@code false} otherwise
             */
            private boolean mustEscape(CharSequence value) {
                int pos = 0;
                while (pos < value.length()) {
                    final char ch = value.charAt(pos++);
                    if (ch == separator || isLineBreak(ch)) {
                        return true;
                    }
                }
                return false;
            }
        }

        private final Param param;

        // Largest embedding index plus one; zero if there is no embedding.
        private final int columnCount;

        /**
         * Creates a joiner from the given separator, quote and embedding.
         *
         * @param separator the CSV separator character used by the joiner
         * @param quote the CSV quote character used by the joiner
         * @param embedding the column indexes to join
         * @throws NullPointerException if one of the parameters is {@code null}
         * @throws IllegalArgumentException if the separator and the quote
         *         character are the same
         */
        public ColumnJoiner(
            final Separator separator,
            final Quote quote,
            final ColumnIndexes embedding
        ) {
            final char sep = separator.value;
            final char qt = quote.value;
            if (sep == qt) {
                throw new IllegalArgumentException(String.format(
                    "Separator and quote char must be different: %s == %s.",
                    sep, qt
                ));
            }

            param = new Param(sep, qt, embedding.values);
            columnCount = Math.max(max(param.embedding) + 1, 0);
        }

        /**
         * Creates a joiner from the given separator and quote, which joins
         * all columns ({@link ColumnIndexes#ALL}).
         *
         * @param separator the CSV separator character used by the joiner
         * @param quote the CSV quote character used by the joiner
         * @throws NullPointerException if one of the parameters is {@code null}
         */
        public ColumnJoiner(final Separator separator, final Quote quote) {
            this(separator, quote, ColumnIndexes.ALL);
        }

        /**
         * Creates a joiner from the given separator, which uses the default
         * quote character and joins all columns.
         *
         * @param separator the CSV separator character used by the joiner
         * @throws NullPointerException if one of the parameters is {@code null}
         */
        public ColumnJoiner(final Separator separator) {
            this(separator, Quote.DEFAULT, ColumnIndexes.ALL);
        }

        /**
         * Creates a joiner from the given separator and embedding, which uses
         * the default quote character.
         *
         * @param separator the CSV separator character used by the joiner
         * @param embedding the column indexes to join
         * @throws NullPointerException if one of the parameters is {@code null}
         */
        public ColumnJoiner(final Separator separator, final ColumnIndexes embedding) {
            this(separator, Quote.DEFAULT, embedding);
        }

        /**
         * Creates a joiner from the given quote, which uses the default
         * separator character and joins all columns.
         *
         * @param quote the CSV quote character used by the joiner
         * @throws NullPointerException if one of the parameters is {@code null}
         */
        public ColumnJoiner(final Quote quote) {
            this(Separator.DEFAULT, quote, ColumnIndexes.ALL);
        }

        /**
         * Creates a joiner from the given <em>embedding</em> column indexes,
         * which uses the default separator and quote character.
         *
         * @param embedding the embedding column indexes
         * @throws NullPointerException if the given {@code embedding} is
         *         {@code null}
         */
        public ColumnJoiner(final ColumnIndexes embedding) {
            this(Separator.DEFAULT, Quote.DEFAULT, embedding);
        }

        /**
         * Creates a joiner from the given quote and embedding, which uses the
         * default separator character.
         *
         * @param quote the CSV quote character used by the joiner
         * @param embedding the column indexes to join
         * @throws NullPointerException if one of the parameters is {@code null}
         */
        public ColumnJoiner(final Quote quote, final ColumnIndexes embedding) {
            this(Separator.DEFAULT, quote, embedding);
        }

        /**
         * Returns the greatest element of the given array, or
         * {@link Integer#MIN_VALUE} if the array is empty.
         */
        private static int max(int[] array) {
            int result = Integer.MIN_VALUE;
            for (int i = 0; i < array.length; ++i) {
                result = Math.max(result, array[i]);
            }
            return result;
        }

        /**
         * Joins the given {@code columns} into one CSV line. Without an
         * embedding, the columns are joined in the given order. With an
         * embedding, the i-th column is written at the i-th embedding index;
         * a column with a negative embedding index is dropped, and positions
         * which receive no column are written as empty strings.
         *
         * @param columns the CSV columns to join
         * @return the joined CSV columns
         */
        public String join(final Iterable<?> columns) {
            final int[] targets = param.embedding;
            if (targets.length == 0) {
                return join0(columns);
            }

            final Object[] row = new Object[columnCount];
            final var source = columns.iterator();
            for (int slot = 0; source.hasNext() && slot < targets.length; ++slot) {
                final Object column = source.next();
                final int target = targets[slot];
                if (target >= 0) {
                    row[target] = column;
                }
            }

            return join0(Arrays.asList(row));
        }

        /**
         * Escapes every column and concatenates the results, separated by
         * the separator character.
         */
        private String join0(final Iterable<?> cols) {
            final var line = new StringBuilder();
            boolean first = true;
            for (final Object column : cols) {
                if (!first) {
                    line.append(param.separator);
                }
                first = false;
                line.append(param.escape(column));
            }

            return line.toString();
        }

        /**
         * Joins the given {@code columns} array into one CSV line. The array
         * is wrapped into a list and passed to {@link #join(Iterable)}.
         *
         * @param columns the CSV columns to join
         * @return the joined CSV columns
         */
        public String join(final Object[] columns) {
            final List<Object> list = Arrays.asList(columns);
            return join(list);
        }
    }

    /**
     * A source of characters, which are delivered one at a time.
     *
     * @since 8.2
     * @version 8.2
     */
    sealed interface CharCursor {
        /**
         * Return the next character or -1 if there is no one.
         *
         * @return the next character or -1 if there is no one
         * @throws IOException if reading the next character failed
         */
        int next() throws IOException;

        /**
         * Creates the cursor implementation which fits the given
         * {@code readable}: a {@link CharBufferCharCursor} for a
         * {@link CharBuffer} and a {@link ReadableCharCursor} for everything
         * else.
         *
         * @param readable the character source
         * @return a new character cursor
         */
        static CharCursor of(final Readable readable) {
            if (readable instanceof CharBuffer chars) {
                return new CharBufferCharCursor(chars);
            }
            return new ReadableCharCursor(readable);
        }
    }

    /**
     * Cursor <em>view</em> on a readable object. The characters are fetched
     * in chunks into an internal buffer and handed out from there.
     *
     * @since 8.2
     * @version 8.2
     */
    static final class ReadableCharCursor implements CharCursor {
        private static final int SIZE = 1024;
        private final Readable readable;
        private final CharBuffer buffer;

        ReadableCharCursor(final Readable readable) {
            this.readable = requireNonNull(readable);
            this.buffer = CharBuffer.allocate(SIZE);
            // Start with an empty buffer, so the first 'next' triggers a fill.
            this.buffer.flip();
        }

        @Override
        public int next() throws IOException {
            if (!buffer.hasRemaining() && !fill()) {
                return -1;
            }

            return buffer.get();
        }

        /**
         * Refills the internal buffer from the readable.
         *
         * @return {@code true} if at least one character has been read,
         *         {@code false} otherwise
         */
        private boolean fill() throws IOException {
            buffer.clear();

            // A read of zero chars is retried, but only a bounded number of
            // times, so that this method is guaranteed to terminate.
            int count = readable.read(buffer);
            for (int retry = 0; count == 0 && retry < 1000; ++retry) {
                count = readable.read(buffer);
            }
            buffer.flip();

            return count > 0;
        }
    }

    /**
     * Cursor <em>view</em> on a character buffer. The characters are taken
     * directly from the wrapped buffer.
     *
     * @since 8.2
     * @version 8.2
     */
    static final class CharBufferCharCursor implements CharCursor {
        private final CharBuffer buffer;

        CharBufferCharCursor(final CharBuffer buffer) {
            this.buffer = requireNonNull(buffer);
        }

        @Override
        public int next() {
            if (buffer.hasRemaining()) {
                return buffer.get();
            }
            return -1;
        }
    }

    /**
     * Collects single chars in a growing array, which can be converted into
     * a string.
     *
     * @since 8.2
     * @version 8.2
     */
    static final class CharAppender {
        private static final int SIZE = 32;

        private char[] buffer = new char[SIZE];
        private int index = 0;

        CharAppender() {
        }

        /**
         * Tests whether at least one char has been appended since creation
         * or the last {@link #reset()}.
         */
        boolean nonEmpty() {
            return index > 0;
        }

        /**
         * Appends the given char. The capacity is doubled when the internal
         * array is full.
         */
        void append(final char c) {
            if (index == buffer.length) {
                increaseSize(buffer.length*2);
            }

            buffer[index] = c;
            ++index;
        }

        /**
         * Returns the appended chars as string.
         */
        @Override
        public String toString() {
            return new String(buffer, 0, index);
        }

        /**
         * Discards the appended chars; the internal array is kept.
         */
        void reset() {
            index = 0;
        }

        // Only called when the array is full, so all chars are carried over.
        private void increaseSize(final int newSize) {
            buffer = Arrays.copyOf(buffer, newSize);
        }
    }

    /**
     * Simple growing list of strings.
     *
     * @since 8.2
     * @version 8.2
     */
    static final class StringList {
        private static final int SIZE = 16;
        private String[] elements;
        private int size;

        StringList() {
            size = 0;
            elements = new String[SIZE];
        }

        /**
         * Returns the number of added elements.
         */
        public int size() {
            return size;
        }

        /**
         * Appends the given value. The capacity is doubled when the internal
         * array is full.
         */
        public void add(final String value) {
            if (size == elements.length) {
                increaseSize(elements.length*2);
            }
            elements[size] = value;
            ++size;
        }

        /**
         * Overwrites the slot at the given index. The index is not checked
         * against {@link #size()}.
         */
        public void set(final int index, final String value) {
            elements[index] = value;
        }

        /**
         * Sets the size back to zero; the internal array is kept.
         */
        public void clear() {
            size = 0;
        }

        /**
         * Returns a new array, which contains the first {@link #size()}
         * elements.
         */
        public String[] toArray() {
            return Arrays.copyOf(elements, size);
        }

        // Only called when the array is full, so all elements are carried over.
        private void increaseSize(final int newSize) {
            elements = Arrays.copyOf(elements, newSize);
        }

    }

}