001/* 002 * Java Genetic Algorithm Library (jenetics-8.2.0). 003 * Copyright (c) 2007-2025 Franz Wilhelmstötter 004 * 005 * Licensed under the Apache License, Version 2.0 (the "License"); 006 * you may not use this file except in compliance with the License. 007 * You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 * 017 * Author: 018 * Franz Wilhelmstötter (franz.wilhelmstoetter@gmail.com) 019 */ 020package io.jenetics.ext.util; 021 022import static java.util.Objects.requireNonNull; 023 024import java.io.IOException; 025import java.io.UncheckedIOException; 026import java.nio.CharBuffer; 027import java.util.Arrays; 028import java.util.List; 029import java.util.Objects; 030import java.util.function.Function; 031import java.util.function.Supplier; 032import java.util.stream.Collector; 033import java.util.stream.Collectors; 034import java.util.stream.Stream; 035 036import io.jenetics.internal.util.Lifecycle.IOValue; 037 038/** 039 * This class contains helper classes, which are the building blocks for handling 040 * CSV files. 041 * <ul> 042 * <li>{@link LineReader}: This class allows you to read the lines of a 043 * CSV file. The result will be a {@link Stream} of CSV lines and are 044 * not split.</li> 045 * <li>{@link LineSplitter}: This class is responsible for splitting one 046 * CSV line into column values.</li> 047 * <li>{@link ColumnIndexes}: Allows to define the projection/embedding of 048 * the split/joined column values.</li> 049 * <li>{@link ColumnJoiner}: Joining a column array into a CSV line, which 050 * can be joined into a whole CSV string.</li> 051 * </ul> 052 * <p> 053 * Additionally, this class contains a set of helper methods for CSV handling 054 * using default configurations. 055 * <p> 056 * <b>Reading and splitting CSV lines</b> 057 * {@snippet class="Snippets" region="readRows"} 058 * <p> 059 * <b>Joining columns and creating CSV string</b> 060 * {@snippet class="Snippets" region="CsvSupportSnippets.collect"} 061 * <p> 062 * <b>Parsing CSV string</b> 063 * {@snippet class="Snippets" region="parseCsv"} 064 * <p> 065 * <b>Parsing double values, given as CSV string</b> 066 * <p> 067 * Another example is to parse double values, which are given as CSV string and 068 * use this data for running a regression analysis. 069 * {@snippet class="Snippets" region="DoublesParsingSnippets.parseDoubles"} 070 * 071 * @see <a href="https://tools.ietf.org/html/rfc4180">RFC-4180</a> 072 * 073 * @author <a href="mailto:franz.wilhelmstoetter@gmail.com">Franz Wilhelmstötter</a> 074 * @version 8.2 075 * @since 8.1 076 */ 077public final class CsvSupport { 078 079 /** 080 * Holds the CSV column <em>separator</em> character. 081 * 082 * @param value the separator character 083 * 084 * @version 8.1 085 * @since 8.1 086 */ 087 public record Separator(char value) { 088 089 /** 090 * The default separator character, '{@code ,}'. 091 */ 092 public static final Separator DEFAULT = new Separator(','); 093 094 /** 095 * Creates a new Separator char object. 096 * 097 * @param value the separator character 098 * @throws IllegalArgumentException if the given separator character is 099 * a line break character 100 */ 101 public Separator { 102 if (isLineBreak(value)) { 103 throw new IllegalArgumentException( 104 "Given separator char is a line break character." 105 ); 106 } 107 } 108 } 109 110 /** 111 * Holds the CSV column <em>quote</em> character. The following excerpt from 112 * <a href="https://tools.ietf.org/html/rfc4180">RFC-4180</a> defines when 113 * a quote character has to be used. 114 * <pre> 115 * 5. Each field may or may not be enclosed in double quotes (however 116 * some programs, such as Microsoft Excel, do not use double quotes 117 * at all). If fields are not enclosed with double quotes, then 118 * double quotes may not appear inside the fields. For example: 119 * 120 * "aaa","bbb","ccc" CRLF 121 * zzz,yyy,xxx 122 * 123 * 6. Fields containing line breaks (CRLF), double quotes, and commas 124 * should be enclosed in double-quotes. For example: 125 * 126 * "aaa","b CRLF 127 * bb","ccc" CRLF 128 * zzz,yyy,xxx 129 * 130 * 7. If double-quotes are used to enclose fields, then a double-quote 131 * appearing inside a field must be escaped by preceding it with 132 * another double quote. For example: 133 * 134 * "aaa","b""bb","ccc" 135 * </pre> 136 * 137 * @param value the quote character 138 * 139 * @version 8.1 140 * @since 8.1 141 */ 142 public record Quote(char value) { 143 144 /** 145 * The default quote character, '{@code "}'. 146 */ 147 public static final Quote DEFAULT = new Quote('"'); 148 149 /** 150 * The zero '\0' character. 151 */ 152 public static final Quote ZERO = new Quote('\0'); 153 154 /** 155 * Creates a new Quote char object. 156 * 157 * @param value the quote character 158 * @throws IllegalArgumentException if the given quote character is 159 * a line break character 160 */ 161 public Quote { 162 if (isLineBreak(value)) { 163 throw new IllegalArgumentException( 164 "Given quote char is a line break character." 165 ); 166 } 167 } 168 } 169 170 /** 171 * Holds the column indexes, which should be part of the split or join 172 * operation. When used in the {@link LineSplitter}, it lets you filter the 173 * split column and define its order. When used in the {@link ColumnJoiner}, 174 * it can be used to define the column index in the resulting CSV for a 175 * given row array. 176 * 177 * @apiNote 178 * The column indexes is <em>thread-safe</em> and can be shared between 179 * different threads. 180 * 181 * @see LineSplitter 182 * @see ColumnJoiner 183 * 184 * @param values the column indexes which are part of the split result 185 * 186 * @version 8.1 187 * @since 8.1 188 */ 189 public record ColumnIndexes(int... values) { 190 191 /** 192 * Indicating that <em>all</em> columns should be part of the split 193 * result. 194 */ 195 public static final ColumnIndexes ALL = new ColumnIndexes(); 196 197 /** 198 * Create a new column indexes object. 199 * 200 * @param values the column indexes 201 */ 202 public ColumnIndexes { 203 values = values.clone(); 204 } 205 206 @Override 207 public int[] values() { 208 return values.clone(); 209 } 210 211 @Override 212 public int hashCode() { 213 return Arrays.hashCode(values); 214 } 215 216 @Override 217 public boolean equals(final Object obj) { 218 return obj == this || 219 obj instanceof ColumnIndexes ci && 220 Arrays.equals(values, ci.values); 221 } 222 223 @Override 224 public String toString() { 225 return Arrays.toString(values); 226 } 227 } 228 229 /** 230 * The newline string used for writing the CSV file: {@code \r\n}. 231 */ 232 public static final String EOL = "\r\n"; 233 234 235 private CsvSupport() { 236 } 237 238 private static boolean isLineBreak(final char c) { 239 return switch (c) { 240 case '\n', '\r' -> true; 241 default -> false; 242 }; 243 } 244 245 /** 246 * Splits the CSV file, given by the {@code reader}, into a {@link Stream} 247 * of CSV lines. The CSV is split at line breaks, as long as they are not 248 * part of a quoted column. For reading the CSV lines, the default quote 249 * character, {@link Quote#DEFAULT}, is used. 250 * 251 * @apiNote 252 * The returned stream must be closed by the caller, which also closes the 253 * CSV {@code reader}. 254 * 255 * @see #readAllLines(Readable) 256 * 257 * @param reader the CSV source reader. The reader is automatically closed 258 * when the returned line stream is closed. 259 * @return the stream of CSV lines 260 * @throws NullPointerException if the given {@code reader} is {@code null} 261 */ 262 public static Stream<String> lines(final Readable reader) { 263 return LineReader.DEFAULT.read(reader); 264 } 265 266 /** 267 * Splits the CSV file, given by the {@code reader}, into a {@code Stream} 268 * of CSV rows. The CSV is split at line breaks, as long as they are not 269 * part of a quoted column. For reading the CSV lines, the default quote 270 * character, {@link Quote#DEFAULT}, is used. Then each line is split into 271 * its columns using the default separator character. 272 * 273 * @apiNote 274 * The returned stream must be closed by the caller, which also closes the 275 * CSV {@code reader}. 276 * 277 * @see #readAllRows(Readable) 278 * 279 * @param reader the CSV source reader. The reader is automatically closed 280 * when the returned line stream is closed. 281 * @return the stream of CSV rows 282 * @throws NullPointerException if the given {@code reader} is {@code null} 283 */ 284 public static Stream<String[]> rows(final Readable reader) { 285 final var splitter = new LineSplitter(); 286 return lines(reader).map(splitter::split); 287 } 288 289 /** 290 * Splits the CSV file, given by the {@code reader}, into a {@code List} 291 * of CSV lines. The CSV is split at line breaks, as long as they are not 292 * part of a quoted column. For reading the CSV lines, the default quote 293 * character, {@link Quote#DEFAULT}, is used. 294 * 295 * @see #lines(Readable) 296 * 297 * @param reader the reader stream to split into CSV lines 298 * @return the list of CSV lines 299 * @throws NullPointerException if the given {@code reader} is {@code null} 300 * @throws IOException if reading the CSV lines fails 301 */ 302 public static List<String> readAllLines(final Readable reader) 303 throws IOException 304 { 305 try (var lines = lines(reader)) { 306 return lines.toList(); 307 } catch (UncheckedIOException e) { 308 throw e.getCause(); 309 } 310 } 311 312 /** 313 * Splits the CSV file, given by the {@code reader}, into a {@code List} 314 * of CSV lines. The CSV is split at line breaks, as long as they are not 315 * part of a quoted column. For reading the CSV lines, the default quote 316 * character, {@link Quote#DEFAULT}, is used. Then each line is split into 317 * its columns using the default separator character. 318 * 319 * @see #rows(Readable) 320 * 321 * @param reader the reader stream to split into CSV lines 322 * @return the list of CSV rows 323 * @throws NullPointerException if the given {@code reader} is {@code null} 324 * @throws IOException if reading the CSV lines fails 325 */ 326 public static List<String[]> readAllRows(final Readable reader) 327 throws IOException 328 { 329 try (var rows = rows(reader)) { 330 return rows.toList(); 331 } catch (UncheckedIOException e) { 332 throw e.getCause(); 333 } 334 } 335 336 /** 337 * Parses the given CSV string into a list of <em>records</em>. The records 338 * are created from a <em>row</em> ({@code String[]} array) by applying the 339 * given {@code mapper}. 340 * 341 * @param csv the CSV string to parse 342 * @param mapper the record mapper 343 * @return the parsed record list 344 * @param <T> the record type 345 */ 346 public static <T> List<T> parse( 347 final CharSequence csv, 348 final Function<? super String[], ? extends T> mapper 349 ) { 350 requireNonNull(csv); 351 requireNonNull(mapper); 352 353 try (var rows = rows(CharBuffer.wrap(csv))) { 354 return rows 355 .map(mapper) 356 .collect(Collectors.toUnmodifiableList()); 357 } 358 } 359 360 /** 361 * Parses the given CSV string into a list of rows. 362 * 363 * @param csv the CSV string to parse 364 * @return the parsed CSV rows 365 */ 366 public static List<String[]> parse(final CharSequence csv) { 367 return parse(csv, Function.identity()); 368 } 369 370 /** 371 * Parses the given CSV string into a list of {@code double[]} array rows. 372 * 373 * @param csv the CSV string to parse 374 * @return the parsed double data 375 */ 376 public static List<double[]> parseDoubles(final CharSequence csv) { 377 return parse(csv, CsvSupport::toDoubles); 378 } 379 380 private static double[] toDoubles(final String[] values) { 381 final var result = new double[values.length]; 382 for (int i = 0; i < result.length; ++i) { 383 result[i] = Double.parseDouble(values[i].trim()); 384 } 385 return result; 386 } 387 388 /** 389 * Splits a given CSV {@code line} into columns. The default values for the 390 * separator and quote character are used ({@link Separator#DEFAULT}, 391 * {@link Quote#DEFAULT}) for splitting the line. 392 * 393 * @param line the CSV line to split 394 * @return the split CSV lines 395 * @throws NullPointerException if the given {@code line} is {@code null} 396 */ 397 public static String[] split(final CharSequence line) { 398 return new LineSplitter().split(line); 399 } 400 401 /** 402 * Joins the given CSV {@code columns} to one CSV line. The default values 403 * for the separator and quote character are used ({@link Separator#DEFAULT}, 404 * {@link Quote#DEFAULT}) for joining the columns. 405 * 406 * @see #join(Object[]) 407 * 408 * @param columns the CSV columns to join 409 * @return the CSV line, joined from the given {@code columns} 410 * @throws NullPointerException if the given {@code columns} is {@code null} 411 */ 412 public static String join(final Iterable<?> columns) { 413 return ColumnJoiner.DEFAULT.join(columns); 414 } 415 416 /** 417 * Joins the given CSV {@code columns} to one CSV line. The default values 418 * for the separator and quote character are used ({@link Separator#DEFAULT}, 419 * {@link Quote#DEFAULT}) for joining the columns. 420 * 421 * @see #join(Iterable) 422 * 423 * @param columns the CSV columns to join 424 * @return the CSV line, joined from the given {@code columns} 425 * @throws NullPointerException if the given {@code columns} is {@code null} 426 */ 427 public static String join(final Object[] columns) { 428 return ColumnJoiner.DEFAULT.join(columns); 429 } 430 431 /** 432 * Joins the given CSV {@code columns} to one CSV line. The default values 433 * for the separator and quote character are used ({@link Separator#DEFAULT}, 434 * {@link Quote#DEFAULT}) for joining the columns. 435 * 436 * @see #join(Iterable) 437 * @see #join(Object[]) 438 * 439 * @param columns the CSV columns to join 440 * @return the CSV line, joined from the given {@code columns} 441 * @throws NullPointerException if the given {@code columns} is {@code null} 442 */ 443 public static String join(final String... columns) { 444 return ColumnJoiner.DEFAULT.join(columns); 445 } 446 447 /** 448 * Converts the given {@code record} into its components. 449 * 450 * @param record the record to convert 451 * @return the record components 452 */ 453 public static Object[] toComponents(final Record record) { 454 try { 455 final var components = record.getClass().getRecordComponents(); 456 final var elements = new Object[components.length]; 457 for (int i = 0; i < elements.length; ++i) { 458 elements[i] = components[i].getAccessor().invoke(record); 459 } 460 461 return elements; 462 } catch (ReflectiveOperationException e) { 463 throw new IllegalArgumentException(e); 464 } 465 } 466 467 /** 468 * Return a collector for joining a list of CSV rows into one CSV string. 469 * 470 * @return a collector for joining a list of CSV rows into one CSV string 471 */ 472 public static Collector<CharSequence, ?, String> toCsv() { 473 return toCsv(EOL); 474 } 475 476 /** 477 * Return a collector for joining a list of CSV rows into one CSV string. 478 * For the line breaks, the given {@code eol} sequence is used. 479 * 480 * @param eol the end of line sequence used for line breaks 481 * @return a collector for joining a list of CSV rows into one CSV string 482 */ 483 public static Collector<CharSequence, ?, String> toCsv(String eol) { 484 if (eol.isEmpty()) { 485 throw new IllegalArgumentException("EOL must not be empty."); 486 } 487 for (int i = 0; i < eol.length(); ++i) { 488 if (!isLineBreak(eol.charAt(i))) { 489 throw new IllegalArgumentException( 490 "EOl contains non-linebreak char: '%s'.".formatted(eol) 491 ); 492 } 493 } 494 495 return Collectors.joining(eol, "", eol); 496 } 497 498 499 /* ************************************************************************* 500 * Base CSV classes. 501 * ************************************************************************/ 502 503 /** 504 * This class reads CSV files and splits it into lines. It takes a quote 505 * character as a parameter, which is necessary for not splitting on quoted 506 * line feeds. 507 * {@snippet lang="java": 508 * final var csv = """ 509 * 0.0,0.0000 510 * 0.1,0.0740 511 * 0.2,0.1120 512 * 0.3,0.1380 513 * 0.4,0.1760 514 * 0.5,0.2500 515 * 0.6,0.3840 516 * 0.7,0.6020 517 * 0.8,0.9280 518 * 0.9,1.3860 519 * 1.0,2.0000 520 * """; 521 * 522 * final var reader = new LineReader(new Quote('"')); 523 * try (Stream<String> lines = reader.read(CharBuffer.wrap(csv))) { 524 * lines.forEach(System.out::println); 525 * } 526 * } 527 * 528 * @apiNote 529 * This reader obeys <em>escaped</em> line breaks according 530 * <a href="https://tools.ietf.org/html/rfc4180">RFC-4180</a>. It is 531 * thread-safe and can be shared between different reading threads. 532 * 533 * @version 8.1 534 * @since 8.1 535 */ 536 public static final class LineReader { 537 538 private static final LineReader DEFAULT = new LineReader(Quote.DEFAULT); 539 540 private final Quote quote; 541 542 /** 543 * Create a new line-reader with the given {@code quote} character, 544 * which is used in the CSV file which is read. 545 * 546 * @param quote the quoting character 547 * @throws NullPointerException if the {@code quote} character is 548 * {@code null} 549 */ 550 public LineReader(final Quote quote) { 551 this.quote = requireNonNull(quote); 552 } 553 554 /** 555 * Create a new line reader with default quote character {@code '"'} 556 * ({@link Quote#DEFAULT}). 557 */ 558 public LineReader() { 559 this(Quote.DEFAULT); 560 } 561 562 /** 563 * Reads all CSV lines from the given {@code reader}. 564 * 565 * @apiNote 566 * This method must be used within a try-with-resources statement or 567 * similar control structure to ensure that the stream's open file is 568 * closed promptly after the stream's operations have completed. 569 * 570 * @param readable the readable from which to read the CSV content 571 * @return the CSV lines from the file as a {@code Stream} 572 */ 573 public Stream<String> read(final Readable readable) { 574 requireNonNull(readable); 575 576 final IOValue<Stream<String>> result = new IOValue<>(resources -> { 577 final Readable rdr = resources.use( 578 readable, 579 resource -> { 580 if (resource instanceof AutoCloseable closeable) { 581 try { 582 closeable.close(); 583 } catch (IOException | RuntimeException | Error e) { 584 throw e; 585 } catch (Exception e) { 586 throw new IOException(e); 587 } 588 } 589 } 590 ); 591 592 final var source = CharCursor.of(rdr); 593 final var line = new CharAppender(); 594 595 final Supplier<String> nextLine = () -> { 596 line.reset(); 597 try { 598 return nextLine(source, line) ? line.toString() : null; 599 } catch (IOException e) { 600 throw new UncheckedIOException(e); 601 } 602 }; 603 604 return Stream.generate(nextLine) 605 .takeWhile(Objects::nonNull); 606 }); 607 608 return result.get().onClose(() -> 609 result.release(UncheckedIOException::new) 610 ); 611 } 612 613 private boolean nextLine(final CharCursor chars, final CharAppender line) 614 throws IOException 615 { 616 boolean quoted = false; 617 boolean escaped = false; 618 boolean eol = false; 619 620 int next = -2; 621 int i = 0; 622 623 while (next >= 0 || (i = chars.next()) != -1) { 624 final char current = next != -2 ? (char)next : (char)i; 625 next = -2; 626 627 if (isLineBreak(current)) { 628 if (quoted) { 629 line.append(current); 630 } else { 631 eol = true; 632 } 633 } else if (current == quote.value) { 634 if (quoted) { 635 if (!escaped && (next = chars.next()) == quote.value) { 636 escaped = true; 637 } else { 638 if (escaped) { 639 escaped = false; 640 } else { 641 quoted = false; 642 } 643 } 644 } else { 645 quoted = true; 646 } 647 line.append(current); 648 } else { 649 line.append(current); 650 } 651 652 if (eol) { 653 eol = false; 654 if (line.nonEmpty()) { 655 return true; 656 } 657 } 658 } 659 660 if (quoted) { 661 throw new IllegalArgumentException( 662 "Unbalanced quote character: '%s'." 663 .formatted(toString(line)) 664 ); 665 } 666 return line.nonEmpty(); 667 } 668 669 private static String toString(final Object value) { 670 final var line = value.toString(); 671 return line.length() > 15 ? line.substring(0, 15) + "..." : line; 672 } 673 } 674 675 /** 676 * Splitting a CSV line into columns (records). 677 * <h2>Examples</h2> 678 * <b>Simple usage</b> 679 * {@snippet class="Snippets" region="LineSplitterSnippets.simpleSplit"} 680 * 681 * <b>Projecting and re-ordering columns</b> 682 * {@snippet class="Snippets" region="LineSplitterSnippets.projectingSplit"} 683 * 684 * @implNote 685 * The split {@code String[]} array will never contain {@code null} values. 686 * Empty columns will be returned as empty strings. 687 * 688 * @apiNote 689 * A line splitter ist <b>not</b> thread-safe and can't be shared between 690 * different threads. 691 * 692 * @version 8.1 693 * @since 8.1 694 */ 695 public static final class LineSplitter { 696 private final Separator separator; 697 private final Quote quote; 698 699 private final ColumnList columns; 700 private final CharAppender column = new CharAppender(); 701 702 /** 703 * Create a new line splitter with the given parameters. 704 * 705 * @param separator the separator character used by the CSV line to split 706 * @param quote the quote character used by the CSV line to split 707 * @param projection the column indexes which should be part of the split 708 * result 709 * @throws NullPointerException if one of the parameters is {@code null} 710 */ 711 public LineSplitter( 712 final Separator separator, 713 final Quote quote, 714 final ColumnIndexes projection 715 ) { 716 if (separator.value == quote.value) { 717 throw new IllegalArgumentException( 718 "Separator and quote char must be different: %s == %s." 719 .formatted(separator.value, quote.value) 720 ); 721 } 722 723 this.separator = separator; 724 this.quote = quote; 725 this.columns = new ColumnList(projection); 726 } 727 728 /** 729 * Create a new line splitter with the given parameters. 730 * 731 * @param separator the separator character used by the CSV line to split 732 * @param quote the quote character used by the CSV line to split 733 * @throws NullPointerException if one of the parameters is {@code null} 734 */ 735 public LineSplitter(final Separator separator, final Quote quote) { 736 this(separator, quote, ColumnIndexes.ALL); 737 } 738 739 /** 740 * Create a new line splitter with the given parameters. The default 741 * quote character, {@link Quote#DEFAULT}, will be used by the created 742 * splitter. 743 * 744 * @param separator the separator character used by the CSV line to split 745 * @throws NullPointerException if one of the parameters is {@code null} 746 */ 747 public LineSplitter(final Separator separator) { 748 this(separator, Quote.DEFAULT, ColumnIndexes.ALL); 749 } 750 751 /** 752 * Create a new line splitter with the given parameters. The default 753 * separator character, {@link Separator#DEFAULT}, will be used by the 754 * created splitter. 755 * 756 * @param quote the quote character used by the CSV line to split 757 * @throws NullPointerException if one of the parameters is {@code null} 758 */ 759 public LineSplitter(final Quote quote) { 760 this(Separator.DEFAULT, quote, ColumnIndexes.ALL); 761 } 762 763 /** 764 * Create a new line splitter with the given parameters. Only the defined 765 * columns will be part of the split result and the default separator 766 * character, {@link Separator#DEFAULT}, and default quote character, 767 * {@link Quote#DEFAULT}, is used by the created splitter. 768 * 769 * @param projection the column indexes which should be part of the split 770 * result 771 * @throws NullPointerException if one of the parameters is {@code null} 772 */ 773 public LineSplitter(final ColumnIndexes projection) { 774 this(Separator.DEFAULT, Quote.DEFAULT, projection); 775 } 776 777 /** 778 * Create a new line splitter with default values. 779 */ 780 public LineSplitter() { 781 this(Separator.DEFAULT, Quote.DEFAULT, ColumnIndexes.ALL); 782 } 783 784 /** 785 * Splitting the given CSV {@code line} into its columns. 786 * 787 * @implNote 788 * The split {@code String[]} array will never contain {@code null} values. 789 * Empty columns will be returned as empty strings. 790 * 791 * @param line the CSV line to split 792 * @return the split CSV columns 793 * @throws NullPointerException if the CSV {@code line} is {@code null} 794 */ 795 public String[] split(final CharSequence line) { 796 columns.clear(); 797 column.reset(); 798 799 boolean quoted = false; 800 boolean escaped = false; 801 boolean full = false; 802 803 int quoteIndex = 0; 804 805 for (int i = 0, n = line.length(); i < n && !full; ++i) { 806 final int previous = i > 0 ? line.charAt(i - 1) : -1; 807 final char current = line.charAt(i); 808 final int next = i + 1 < line.length() ? line.charAt(i + 1) : -1; 809 810 if (current == quote.value) { 811 if (quoted) { 812 if (!escaped && quote.value == next) { 813 escaped = true; 814 } else { 815 if (escaped) { 816 column.append(quote.value); 817 escaped = false; 818 } else { 819 if (next != -1 && separator.value != next) { 820 throw new IllegalArgumentException(""" 821 Only separator character, '%s', allowed \ 822 after quote, but found '%c': 823 %s 824 """.formatted( 825 separator.value, 826 next, 827 toErrorDesc(line, i + 1) 828 ) 829 ); 830 } 831 832 add(column); 833 full = columns.isFull(); 834 quoted = false; 835 } 836 } 837 } else { 838 if (previous != -1 && separator.value != previous) { 839 throw new IllegalArgumentException(""" 840 Only separator character, '%s', allowed before \ 841 quote, but found '%c': 842 %s 843 """.formatted( 844 separator.value, 845 previous, 846 toErrorDesc(line, Math.max(i - 1, 0)) 847 ) 848 ); 849 } 850 851 quoted = true; 852 quoteIndex = i; 853 } 854 } else if (current == separator.value) { 855 if (quoted) { 856 column.append(current); 857 } else if (separator.value == previous || previous == -1) { 858 add(column); 859 full = columns.isFull(); 860 } 861 } else { 862 // Read till the next token separator. 863 int j = i; 864 char c; 865 while (j < line.length() && !isTokenSeparator(c = line.charAt(j))) { 866 column.append(c); 867 ++j; 868 } 869 if (j != i - 1) { 870 i = j - 1; 871 } 872 873 if (!quoted) { 874 add(column); 875 full = columns.isFull(); 876 } 877 } 878 } 879 880 if (quoted) { 881 throw new IllegalArgumentException(""" 882 Unbalanced quote character. 883 %s 884 """.formatted(toErrorDesc(line, quoteIndex)) 885 ); 886 } 887 if (line.isEmpty() || 888 separator.value == line.charAt(line.length() - 1)) 889 { 890 add(column); 891 } 892 893 return columns.toArray(); 894 } 895 896 private void add(final CharAppender column) { 897 columns.add(column.toString()); 898 column.reset(); 899 } 900 901 private boolean isTokenSeparator(final char c) { 902 return c == separator.value || c == quote.value; 903 } 904 905 private static String toErrorDesc(final CharSequence line, final int pos) { 906 return """ 907 %s 908 %s 909 """.formatted( 910 line.toString().stripTrailing(), 911 " ".repeat(pos) + "^" 912 ); 913 } 914 } 915 916 917 /** 918 * Column collection, which is backed up by a string list. 919 */ 920 static final class ColumnList { 921 private final StringList columns = new StringList(); 922 private final ColumnIndexes projection; 923 924 private int index = 0; 925 private int count = 0; 926 927 ColumnList(final ColumnIndexes projection) { 928 this.projection = requireNonNull(projection); 929 } 930 931 /** 932 * Appends a {@code column} to the column collection. 933 * 934 * @param column the column to add 935 */ 936 void add(String column) { 937 if (!isFull()) { 938 count += set(column, index++); 939 } 940 } 941 942 private int set(String element, int column) { 943 int updated = 0; 944 945 if (projection.values.length == 0) { 946 columns.add(element); 947 ++updated; 948 } else { 949 int pos = -1; 950 while ((pos = indexOf(projection.values, pos + 1, column)) != -1) { 951 for (int i = columns.size(); i <= pos; ++i) { 952 columns.add(null); 953 } 954 columns.set(pos, element); 955 ++updated; 956 } 957 } 958 959 return updated; 960 } 961 962 private static int indexOf(int[] array, int start, int value) { 963 for (int i = start; i < array.length; ++i) { 964 if (array[i] == value) { 965 return i; 966 } 967 } 968 969 return -1; 970 } 971 972 /** 973 * Checks whether another column can be added. 974 * 975 * @return {@code true} if another column can be added to this 976 * collection, {@code false} otherwise 977 */ 978 boolean isFull() { 979 return 980 projection.values.length > 0 && 981 projection.values.length <= count; 982 } 983 984 /** 985 * Removes all columns. 986 */ 987 public void clear() { 988 columns.clear(); 989 index = 0; 990 count = 0; 991 } 992 993 String[] toArray() { 994 for (int i = columns.size(); i < projection.values.length; ++i) { 995 columns.add(null); 996 } 997 return columns.toArray(); 998 } 999 1000 } 1001 1002 /** 1003 * This class joins an array of columns into one CSV line. 1004 * 1005 * <h2>Examples</h2> 1006 * <b>Simple usage</b> 1007 * {@snippet class="Snippets" region="ColumnJoinerSnippets.simpleJoin"} 1008 * 1009 * <b>Embedding and re-ordering data</b> 1010 * {@snippet class="Snippets" region="ColumnJoinerSnippets.embedToCsv"} 1011 * 1012 * @apiNote 1013 * The column joiner is <em>thread-safe</em> and can be shared between 1014 * different threads. 1015 * 1016 * @version 8.1 1017 * @since 8.1 1018 */ 1019 public static final class ColumnJoiner { 1020 1021 /** 1022 * Default column joiner, which is using default separator character, 1023 * {@link Separator#DEFAULT}, and default quote character, 1024 * {@link Quote#DEFAULT}. 1025 */ 1026 public static final ColumnJoiner DEFAULT = new ColumnJoiner( 1027 Separator.DEFAULT, 1028 Quote.DEFAULT, 1029 ColumnIndexes.ALL 1030 ); 1031 1032 /** 1033 * The CSV line splitter parameter. 1034 * 1035 * @param separator the column separator char 1036 * @param quote the qute char 1037 * @param embedding the column indices to read. If empty, all split 1038 * columns are used. 1039 */ 1040 private record Param(char separator, char quote, int... embedding) { 1041 1042 private String escape(Object value) { 1043 final var quoteStr = String.valueOf(quote); 1044 1045 if (value == null) { 1046 return ""; 1047 } else { 1048 var stringValue = value.toString(); 1049 var string = stringValue.replace(quoteStr, quoteStr + quoteStr); 1050 1051 if (stringValue.length() != string.length() || mustEscape(string)) { 1052 return quoteStr + string + quoteStr; 1053 } else { 1054 return stringValue; 1055 } 1056 } 1057 } 1058 1059 private boolean mustEscape(CharSequence value) { 1060 for (int i = 0; i < value.length(); ++i) { 1061 final char c = value.charAt(i); 1062 if (c == separator || isLineBreak(c)) { 1063 return true; 1064 } 1065 } 1066 return false; 1067 } 1068 } 1069 1070 private final Param param; 1071 private final int columnCount; 1072 1073 /** 1074 * Create a new column joiner with the given parameters. 1075 * 1076 * @param separator the CSV separator character used by the joiner 1077 * @param quote the CSV quote character used by the joiner 1078 * @param embedding the column indexes to join 1079 * @throws NullPointerException if one of the parameters is {@code null} 1080 */ 1081 public ColumnJoiner( 1082 final Separator separator, 1083 final Quote quote, 1084 final ColumnIndexes embedding 1085 ) { 1086 if (separator.value == quote.value) { 1087 throw new IllegalArgumentException( 1088 "Separator and quote char must be different: %s == %s." 1089 .formatted(separator.value, quote.value) 1090 ); 1091 } 1092 1093 param = new Param(separator.value, quote.value, embedding.values); 1094 columnCount = Math.max(max(param.embedding) + 1, 0); 1095 } 1096 1097 /** 1098 * Create a new column joiner with the given parameters. 1099 * 1100 * @param separator the CSV separator character used by the joiner 1101 * @param quote the CSV quote character used by the joiner 1102 * @throws NullPointerException if one of the parameters is {@code null} 1103 */ 1104 public ColumnJoiner(final Separator separator, final Quote quote) { 1105 this(separator, quote, ColumnIndexes.ALL); 1106 } 1107 1108 /** 1109 * Create a new column joiner with the given parameters. 1110 * 1111 * @param separator the CSV separator character used by the joiner 1112 * @throws NullPointerException if one of the parameters is {@code null} 1113 */ 1114 public ColumnJoiner(final Separator separator) { 1115 this(separator, Quote.DEFAULT, ColumnIndexes.ALL); 1116 } 1117 1118 /** 1119 * Create a new column joiner with the given parameters. 1120 * 1121 * @param separator the CSV separator character used by the joiner 1122 * @param embedding the column indexes to join 1123 * @throws NullPointerException if one of the parameters is {@code null} 1124 */ 1125 public ColumnJoiner(final Separator separator, final ColumnIndexes embedding) { 1126 this(separator, Quote.DEFAULT, embedding); 1127 } 1128 1129 1130 /** 1131 * Create a new column joiner with the given parameters. 1132 * 1133 * @param quote the CSV quote character used by the joiner 1134 * @throws NullPointerException if one of the parameters is {@code null} 1135 */ 1136 public ColumnJoiner(final Quote quote) { 1137 this(Separator.DEFAULT, quote, ColumnIndexes.ALL); 1138 } 1139 1140 /** 1141 * Create a new column joiner with the given <em>embedding</em> column 1142 * indexes. 1143 * 1144 * @param embedding the embedding column indexes 1145 */ 1146 public ColumnJoiner(final ColumnIndexes embedding) { 1147 this(Separator.DEFAULT, Quote.DEFAULT, embedding); 1148 } 1149 1150 /** 1151 * Create a new column joiner with the given parameters. 1152 * 1153 * @param quote the CSV quote character used by the joiner 1154 * @param embedding the column indexes to join 1155 * @throws NullPointerException if one of the parameters is {@code null} 1156 */ 1157 public ColumnJoiner(final Quote quote, final ColumnIndexes embedding) { 1158 this(Separator.DEFAULT, quote, embedding); 1159 } 1160 1161 private static int max(int[] array) { 1162 int max = Integer.MIN_VALUE; 1163 for (int value : array) { 1164 if (value > max) { 1165 max = value; 1166 } 1167 } 1168 return max; 1169 } 1170 1171 /** 1172 * Joins the given CSV {@code columns}, using the given separator and 1173 * quote character. 1174 * 1175 * @param columns the CSV columns to join 1176 * @return the joined CSV columns 1177 */ 1178 public String join(final Iterable<?> columns) { 1179 if (param.embedding.length == 0) { 1180 return join0(columns); 1181 } else { 1182 final var values = new Object[columnCount]; 1183 final var it = columns.iterator(); 1184 int i = 0; 1185 while (it.hasNext() && i < param.embedding.length) { 1186 final var col = it.next(); 1187 final var index = param.embedding[i++]; 1188 if (index >= 0) { 1189 values[index] = col; 1190 } 1191 } 1192 1193 return join0(Arrays.asList(values)); 1194 } 1195 } 1196 1197 private String join0(final Iterable<?> cols) { 1198 final var row = new StringBuilder(); 1199 final var it = cols.iterator(); 1200 while (it.hasNext()) { 1201 final var column = it.next(); 1202 row.append(param.escape(column)); 1203 if (it.hasNext()) { 1204 row.append(param.separator); 1205 } 1206 } 1207 1208 return row.toString(); 1209 } 1210 1211 /** 1212 * Joins the given CSV {@code columns}, using the given separator and 1213 * quote character. 1214 * 1215 * @param columns the CSV columns to join 1216 * @return the joined CSV columns 1217 */ 1218 public String join(final Object[] columns) { 1219 return join(Arrays.asList(columns)); 1220 } 1221 } 1222 1223 static final class CharSequenceCursor { 1224 private final CharSequence chars; 1225 1226 int previous; 1227 char current; 1228 int next; 1229 int index = 0; 1230 1231 private CharSequenceCursor(final CharSequence chars) { 1232 this.chars = requireNonNull(chars); 1233 } 1234 1235 boolean hasNext() { 1236 return index < chars.length(); 1237 } 1238 1239 void advance() { 1240 if (index == 0) { 1241 previous = -1; 1242 current = chars.charAt(0); 1243 next = 1 < chars.length() ? chars.charAt(1) : -1; 1244 ++index; 1245 } else { 1246 previous = current; 1247 current = (char)next; 1248 ++index; 1249 next = index < chars.length() ? chars.charAt(index) : -1; 1250 } 1251 } 1252 1253 void set(int i) { 1254 previous = i > 0 ? chars.charAt(i - 1) : -1; 1255 current = chars.charAt(i); 1256 next = i + 1 < chars.length() ? chars.charAt(i + 1) : -1; 1257 index = i + 1; 1258 } 1259 } 1260 1261 /** 1262 * Character source interface. 1263 * 1264 * @since 8.2 1265 * @version 8.2 1266 */ 1267 sealed interface CharCursor { 1268 /** 1269 * Return the next character or -1 if there is no one. 1270 * 1271 * @return the next character or -1 if there is no one 1272 * @throws IOException if reading the next character failed 1273 */ 1274 int next() throws IOException; 1275 1276 /** 1277 * Return the correct kind of {@code CharCursor}, depending on the 1278 * given {@code readable} type 1279 * 1280 * @param readable the character source 1281 * @return a new character cursor 1282 */ 1283 static CharCursor of(final Readable readable) { 1284 return readable instanceof CharBuffer cb 1285 ? new CharBufferCharCursor(cb) 1286 : new ReadableCharCursor(readable); 1287 } 1288 } 1289 1290 /** 1291 * Cursor <em>view</em> on a readable object. 1292 * 1293 * @since 8.2 1294 * @version 8.2 1295 */ 1296 static final class ReadableCharCursor implements CharCursor { 1297 private static final int SIZE = 1024; 1298 private final Readable readable; 1299 private final CharBuffer buffer; 1300 1301 ReadableCharCursor(final Readable readable) { 1302 this.readable = requireNonNull(readable); 1303 this.buffer = CharBuffer.allocate(SIZE).flip(); 1304 } 1305 1306 @Override 1307 public int next() throws IOException { 1308 if (!buffer.hasRemaining()) { 1309 if (!fill()) { 1310 return -1; 1311 } 1312 } 1313 1314 return buffer.get(); 1315 } 1316 1317 private boolean fill() throws IOException { 1318 int n; 1319 int i = 0; 1320 buffer.clear(); 1321 do { 1322 n = readable.read(buffer); 1323 } while (n == 0 && i++ < 1000); // Make sure re-read will terminate. 1324 buffer.flip(); 1325 1326 return n > 0; 1327 } 1328 } 1329 1330 /** 1331 * Cursor <em>view</em> on a character buffer. 1332 * 1333 * @since 8.2 1334 * @version 8.2 1335 */ 1336 static final class CharBufferCharCursor implements CharCursor { 1337 private final CharBuffer buffer; 1338 1339 CharBufferCharCursor(final CharBuffer buffer) { 1340 this.buffer = requireNonNull(buffer); 1341 } 1342 1343 @Override 1344 public int next() { 1345 if (!buffer.hasRemaining()) { 1346 return -1; 1347 } 1348 return buffer.get(); 1349 } 1350 } 1351 1352 /** 1353 * Allows appending chars in bulks to {@link StringBuilder}. 1354 * 1355 * @since 8.2 1356 * @version 8.2 1357 */ 1358 static final class CharAppender { 1359 private static final int SIZE = 32; 1360 1361 private char[] buffer = new char[SIZE]; 1362 private int index = 0; 1363 1364 CharAppender() { 1365 } 1366 1367 boolean nonEmpty() { 1368 return index != 0; 1369 } 1370 1371 void append(final char c) { 1372 if (index == buffer.length) { 1373 increaseSize(buffer.length*2); 1374 } 1375 1376 buffer[index++] = c; 1377 } 1378 1379 @Override 1380 public String toString() { 1381 return String.valueOf(buffer, 0, index); 1382 } 1383 1384 void reset() { 1385 index = 0; 1386 } 1387 1388 private void increaseSize(final int newSize) { 1389 final char[] newBuffer = new char[newSize]; 1390 System.arraycopy(buffer, 0, newBuffer, 0, index); 1391 buffer = newBuffer; 1392 } 1393 } 1394 1395 /** 1396 * Simple growing list of strings. 1397 * 1398 * @since 8.2 1399 * @version 8.2 1400 */ 1401 static final class StringList { 1402 private static final int SIZE = 16; 1403 private String[] elements; 1404 private int size; 1405 1406 StringList() { 1407 size = 0; 1408 elements = new String[SIZE]; 1409 } 1410 1411 public int size() { 1412 return size; 1413 } 1414 1415 public void add(final String value) { 1416 if (size == elements.length) { 1417 increaseSize(elements.length*2); 1418 } 1419 elements[size++] = value; 1420 } 1421 1422 public void set(final int index, final String value) { 1423 elements[index] = value; 1424 } 1425 1426 public void clear() { 1427 size = 0; 1428 } 1429 1430 public String[] toArray() { 1431 final var result = new String[size]; 1432 System.arraycopy(elements, 0, result, 0, size); 1433 return result; 1434 } 1435 1436 private void increaseSize(final int newSize) { 1437 final String[] newElements = new String[newSize]; 1438 System.arraycopy(elements, 0, newElements, 0, size); 1439 elements = newElements; 1440 } 1441 1442 } 1443 1444} 1445 1446