001/* 002 * Java Genetic Algorithm Library (jenetics-8.1.0). 003 * Copyright (c) 2007-2024 Franz Wilhelmstötter 004 * 005 * Licensed under the Apache License, Version 2.0 (the "License"); 006 * you may not use this file except in compliance with the License. 007 * You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 * 017 * Author: 018 * Franz Wilhelmstötter (franz.wilhelmstoetter@gmail.com) 019 */ 020package io.jenetics.ext.util; 021 022import static java.util.Objects.requireNonNull; 023 024import java.io.BufferedReader; 025import java.io.IOException; 026import java.io.Reader; 027import java.io.UncheckedIOException; 028import java.util.ArrayList; 029import java.util.Arrays; 030import java.util.List; 031import java.util.Objects; 032import java.util.function.Function; 033import java.util.function.Supplier; 034import java.util.stream.Collector; 035import java.util.stream.Collectors; 036import java.util.stream.Stream; 037 038import io.jenetics.internal.util.Lifecycle.IOValue; 039 040/** 041 * This class contains helper classes, which are the building blocks for handling 042 * CSV files. 043 * <ul> 044 * <li>{@link LineReader}: This class allows you to read the lines of a 045 * CSV file. 
The result will be a {@link Stream} of CSV lines and are 046 * not split.</li> 047 * <li>{@link LineSplitter}: This class is responsible for splitting one 048 * CSV line into column values.</li> 049 * <li>{@link ColumnIndexes}: Allows to define the projection/embedding of 050 * the split/joined column values.</li> 051 * <li>{@link ColumnJoiner}: Joining a column array into a CSV line, which 052 * can be joined into a whole CSV string.</li> 053 * </ul> 054 * <p> 055 * Additionally, this class contains a set of helper methods for CSV handling 056 * using default configurations. 057 * <p> 058 * <b>Reading and splitting CSV lines</b> 059 * {@snippet class="Snippets" region="readRows"} 060 * <p> 061 * <b>Joining columns and creating CSV string</b> 062 * {@snippet class="Snippets" region="CsvSupportSnippets.collect"} 063 * <p> 064 * <b>Parsing CSV string</b> 065 * {@snippet class="Snippets" region="parseCsv"} 066 * <p> 067 * <b>Parsing double values, given as CSV string</b> 068 * <p> 069 * Another example is to parse double values, which are given as CSV string and 070 * use this data for running a regression analysis. 071 * {@snippet class="Snippets" region="DoublesParsingSnippets.parseDoubles"} 072 * 073 * @see <a href="https://tools.ietf.org/html/rfc4180">RFC-4180</a> 074 * 075 * @author <a href="mailto:franz.wilhelmstoetter@gmail.com">Franz Wilhelmstötter</a> 076 * @version 8.1 077 * @since 8.1 078 */ 079public final class CsvSupport { 080 081 /** 082 * Holds the CSV column <em>separator</em> character. 083 * 084 * @param value the separator character 085 * 086 * @version 8.1 087 * @since 8.1 088 */ 089 public record Separator(char value) { 090 091 /** 092 * The default separator character, '{@code ,}'. 093 */ 094 public static final Separator DEFAULT = new Separator(','); 095 096 /** 097 * Creates a new Separator char object. 
098 * 099 * @param value the separator character 100 * @throws IllegalArgumentException if the given separator character is 101 * a line break character 102 */ 103 public Separator { 104 if (isLineBreak(value)) { 105 throw new IllegalArgumentException( 106 "Given separator char is a line break character." 107 ); 108 } 109 } 110 } 111 112 /** 113 * Holds the CSV column <em>quote</em> character. The following excerpt from 114 * <a href="https://tools.ietf.org/html/rfc4180">RFC-4180</a> defines when 115 * a quote character has to be used. 116 * <pre> 117 * 5. Each field may or may not be enclosed in double quotes (however 118 * some programs, such as Microsoft Excel, do not use double quotes 119 * at all). If fields are not enclosed with double quotes, then 120 * double quotes may not appear inside the fields. For example: 121 * 122 * "aaa","bbb","ccc" CRLF 123 * zzz,yyy,xxx 124 * 125 * 6. Fields containing line breaks (CRLF), double quotes, and commas 126 * should be enclosed in double-quotes. For example: 127 * 128 * "aaa","b CRLF 129 * bb","ccc" CRLF 130 * zzz,yyy,xxx 131 * 132 * 7. If double-quotes are used to enclose fields, then a double-quote 133 * appearing inside a field must be escaped by preceding it with 134 * another double quote. For example: 135 * 136 * "aaa","b""bb","ccc" 137 * </pre> 138 * 139 * @param value the quote character 140 * 141 * @version 8.1 142 * @since 8.1 143 */ 144 public record Quote(char value) { 145 146 /** 147 * The default quote character, '{@code "}'. 148 */ 149 public static final Quote DEFAULT = new Quote('"'); 150 151 /** 152 * The zero '\0' character. 153 */ 154 public static final Quote ZERO = new Quote('\0'); 155 156 /** 157 * Creates a new Quote char object. 
158 * 159 * @param value the quote character 160 * @throws IllegalArgumentException if the given quote character is 161 * a line break character 162 */ 163 public Quote { 164 if (isLineBreak(value)) { 165 throw new IllegalArgumentException( 166 "Given quote char is a line break character." 167 ); 168 } 169 } 170 } 171 172 /** 173 * Holds the column indexes, which should be part of the split or join 174 * operation. When used in the {@link LineSplitter}, it lets you filter the 175 * split column and define its order. When used in the {@link ColumnJoiner}, 176 * it can be used to define the column index in the resulting CSV for a 177 * given row array. 178 * 179 * @apiNote 180 * The column indexes is <em>thread-safe</em> and can be shared between 181 * different threads. 182 * 183 * @see LineSplitter 184 * @see ColumnJoiner 185 * 186 * @param values the column indexes which are part of the split result 187 * 188 * @version 8.1 189 * @since 8.1 190 */ 191 public record ColumnIndexes(int... values) { 192 193 /** 194 * Indicating that <em>all</em> columns should be part of the split 195 * result. 196 */ 197 public static final ColumnIndexes ALL = new ColumnIndexes(); 198 199 /** 200 * Create a new column indexes object. 201 * 202 * @param values the column indexes 203 */ 204 public ColumnIndexes { 205 values = values.clone(); 206 } 207 208 @Override 209 public int[] values() { 210 return values.clone(); 211 } 212 213 @Override 214 public int hashCode() { 215 return Arrays.hashCode(values); 216 } 217 218 @Override 219 public boolean equals(final Object obj) { 220 return obj == this || 221 obj instanceof ColumnIndexes ci && 222 Arrays.equals(values, ci.values); 223 } 224 225 @Override 226 public String toString() { 227 return Arrays.toString(values); 228 } 229 } 230 231 /** 232 * The newline string used for writing the CSV file: {@code \r\n}. 
233 */ 234 public static final String EOL = "\r\n"; 235 236 237 private CsvSupport() { 238 } 239 240 private static boolean isLineBreak(final char c) { 241 return switch (c) { 242 case '\n', '\r' -> true; 243 default -> false; 244 }; 245 } 246 247 /** 248 * Splits the CSV file, given by the {@code reader}, into a {@link Stream} 249 * of CSV lines. The CSV is split at line breaks, as long as they are not 250 * part of a quoted column. For reading the CSV lines, the default quote 251 * character, {@link Quote#DEFAULT}, is used. 252 * 253 * @apiNote 254 * The returned stream must be closed by the caller, which also closes the 255 * CSV {@code reader}. 256 * 257 * @see #readAllLines(Reader) 258 * 259 * @param reader the CSV source reader. The reader is automatically closed 260 * when the returned line stream is closed. 261 * @return the stream of CSV lines 262 * @throws NullPointerException if the given {@code reader} is {@code null} 263 */ 264 public static Stream<String> lines(final Reader reader) { 265 return LineReader.DEFAULT.read(reader); 266 } 267 268 /** 269 * Splits the CSV file, given by the {@code reader}, into a {@code Stream} 270 * of CSV rows. The CSV is split at line breaks, as long as they are not 271 * part of a quoted column. For reading the CSV lines, the default quote 272 * character, {@link Quote#DEFAULT}, is used. Then each line is split into 273 * its columns using the default separator character. 274 * 275 * @apiNote 276 * The returned stream must be closed by the caller, which also closes the 277 * CSV {@code reader}. 278 * 279 * @see #readAllRows(Reader) 280 * 281 * @param reader the CSV source reader. The reader is automatically closed 282 * when the returned line stream is closed. 
283 * @return the stream of CSV rows 284 * @throws NullPointerException if the given {@code reader} is {@code null} 285 */ 286 public static Stream<String[]> rows(final Reader reader) { 287 final var splitter = new LineSplitter(); 288 return lines(reader).map(splitter::split); 289 } 290 291 /** 292 * Splits the CSV file, given by the {@code reader}, into a {@code List} 293 * of CSV lines. The CSV is split at line breaks, as long as they are not 294 * part of a quoted column. For reading the CSV lines, the default quote 295 * character, {@link Quote#DEFAULT}, is used. 296 * 297 * @see #lines(Reader) 298 * 299 * @param reader the reader stream to split into CSV lines 300 * @return the list of CSV lines 301 * @throws NullPointerException if the given {@code reader} is {@code null} 302 * @throws IOException if reading the CSV lines fails 303 */ 304 public static List<String> readAllLines(final Reader reader) 305 throws IOException 306 { 307 try (var lines = lines(reader)) { 308 return lines.toList(); 309 } catch (UncheckedIOException e) { 310 throw e.getCause(); 311 } 312 } 313 314 /** 315 * Splits the CSV file, given by the {@code reader}, into a {@code List} 316 * of CSV lines. The CSV is split at line breaks, as long as they are not 317 * part of a quoted column. For reading the CSV lines, the default quote 318 * character, {@link Quote#DEFAULT}, is used. Then each line is split into 319 * its columns using the default separator character. 
320 * 321 * @see #rows(Reader) 322 * 323 * @param reader the reader stream to split into CSV lines 324 * @return the list of CSV rows 325 * @throws NullPointerException if the given {@code reader} is {@code null} 326 * @throws IOException if reading the CSV lines fails 327 */ 328 public static List<String[]> readAllRows(final Reader reader) 329 throws IOException 330 { 331 try (var rows = rows(reader)) { 332 return rows.toList(); 333 } catch (UncheckedIOException e) { 334 throw e.getCause(); 335 } 336 } 337 338 /** 339 * Parses the given CSV string into a list of <em>records</em>. The records 340 * are created from a <em>row</em> ({@code String[]} array) by applying the 341 * given {@code mapper}. 342 * 343 * @param csv the CSV string to parse 344 * @param mapper the record mapper 345 * @return the parsed record list 346 * @param <T> the record type 347 */ 348 public static <T> List<T> parse( 349 final CharSequence csv, 350 final Function<? super String[], ? extends T> mapper 351 ) { 352 requireNonNull(csv); 353 requireNonNull(mapper); 354 355 try (var rows = rows(new CharSeqReader(csv))) { 356 return rows 357 .map(mapper) 358 .collect(Collectors.toUnmodifiableList()); 359 } 360 } 361 362 /** 363 * Parses the given CSV string into a list of rows. 364 * 365 * @param csv the CSV string to parse 366 * @return the parsed CSV rows 367 */ 368 public static List<String[]> parse(final CharSequence csv) { 369 return parse(csv, Function.identity()); 370 } 371 372 /** 373 * Parses the given CSV string into a list of {@code double[]} array rows. 
374 * 375 * @param csv the CSV string to parse 376 * @return the parsed double data 377 */ 378 public static List<double[]> parseDoubles(final CharSequence csv) { 379 return parse(csv, CsvSupport::toDoubles); 380 } 381 382 private static double[] toDoubles(final String[] values) { 383 final var result = new double[values.length]; 384 for (int i = 0; i < result.length; ++i) { 385 result[i] = Double.parseDouble(values[i].trim()); 386 } 387 return result; 388 } 389 390 /** 391 * Splits a given CSV {@code line} into columns. The default values for the 392 * separator and quote character are used ({@link Separator#DEFAULT}, 393 * {@link Quote#DEFAULT}) for splitting the line. 394 * 395 * @param line the CSV line to split 396 * @return the split CSV lines 397 * @throws NullPointerException if the given {@code line} is {@code null} 398 */ 399 public static String[] split(final CharSequence line) { 400 return new LineSplitter().split(line); 401 } 402 403 /** 404 * Joins the given CSV {@code columns} to one CSV line. The default values 405 * for the separator and quote character are used ({@link Separator#DEFAULT}, 406 * {@link Quote#DEFAULT}) for joining the columns. 407 * 408 * @see #join(Object[]) 409 * 410 * @param columns the CSV columns to join 411 * @return the CSV line, joined from the given {@code columns} 412 * @throws NullPointerException if the given {@code columns} is {@code null} 413 */ 414 public static String join(final Iterable<?> columns) { 415 return ColumnJoiner.DEFAULT.join(columns); 416 } 417 418 /** 419 * Joins the given CSV {@code columns} to one CSV line. The default values 420 * for the separator and quote character are used ({@link Separator#DEFAULT}, 421 * {@link Quote#DEFAULT}) for joining the columns. 
422 * 423 * @see #join(Iterable) 424 * 425 * @param columns the CSV columns to join 426 * @return the CSV line, joined from the given {@code columns} 427 * @throws NullPointerException if the given {@code columns} is {@code null} 428 */ 429 public static String join(final Object[] columns) { 430 return ColumnJoiner.DEFAULT.join(columns); 431 } 432 433 /** 434 * Joins the given CSV {@code columns} to one CSV line. The default values 435 * for the separator and quote character are used ({@link Separator#DEFAULT}, 436 * {@link Quote#DEFAULT}) for joining the columns. 437 * 438 * @see #join(Iterable) 439 * @see #join(Object[]) 440 * 441 * @param columns the CSV columns to join 442 * @return the CSV line, joined from the given {@code columns} 443 * @throws NullPointerException if the given {@code columns} is {@code null} 444 */ 445 public static String join(final String... columns) { 446 return ColumnJoiner.DEFAULT.join(columns); 447 } 448 449 /** 450 * Converts the given {@code record} into its components. 451 * 452 * @param record the record to convert 453 * @return the record components 454 */ 455 public static Object[] toComponents(final Record record) { 456 try { 457 final var components = record.getClass().getRecordComponents(); 458 final var elements = new Object[components.length]; 459 for (int i = 0; i < elements.length; ++i) { 460 elements[i] = components[i].getAccessor().invoke(record); 461 } 462 463 return elements; 464 } catch (ReflectiveOperationException e) { 465 throw new IllegalArgumentException(e); 466 } 467 } 468 469 /** 470 * Return a collector for joining a list of CSV rows into one CSV string. 471 * 472 * @return a collector for joining a list of CSV rows into one CSV string 473 */ 474 public static Collector<CharSequence, ?, String> toCsv() { 475 return toCsv(EOL); 476 } 477 478 /** 479 * Return a collector for joining a list of CSV rows into one CSV string. 480 * For the line breaks, the given {@code eol} sequence is used. 
481 * 482 * @param eol the end of line sequence used for line breaks 483 * @return a collector for joining a list of CSV rows into one CSV string 484 */ 485 public static Collector<CharSequence, ?, String> toCsv(String eol) { 486 if (eol.isEmpty()) { 487 throw new IllegalArgumentException("EOL must not be empty."); 488 } 489 for (int i = 0; i < eol.length(); ++i) { 490 if (!isLineBreak(eol.charAt(i))) { 491 throw new IllegalArgumentException( 492 "EOl contains non-linebreak char: '%s'.".formatted(eol) 493 ); 494 } 495 } 496 497 return Collectors.joining(eol, "", eol); 498 } 499 500 501 /* ************************************************************************* 502 * Base CSV classes. 503 * ************************************************************************/ 504 505 /** 506 * This class reads CSV files and splits it into lines. It takes a quote 507 * character as a parameter, which is necessary for not splitting on quoted 508 * line feeds. 509 * {@snippet lang="java": 510 * final var csv = """ 511 * 0.0,0.0000 512 * 0.1,0.0740 513 * 0.2,0.1120 514 * 0.3,0.1380 515 * 0.4,0.1760 516 * 0.5,0.2500 517 * 0.6,0.3840 518 * 0.7,0.6020 519 * 0.8,0.9280 520 * 0.9,1.3860 521 * 1.0,2.0000 522 * """; 523 * 524 * final var reader = new LineReader(new Quote('"')); 525 * try (Stream<String> lines = reader.read(new StringReader(csv))) { 526 * lines.forEach(System.out::println); 527 * } 528 * } 529 * 530 * @apiNote 531 * This reader obeys <em>escaped</em> line breaks according 532 * <a href="https://tools.ietf.org/html/rfc4180">RFC-4180</a>. 533 * 534 * @version 8.1 535 * @since 8.1 536 */ 537 public static final class LineReader { 538 539 private static final LineReader DEFAULT = new LineReader(Quote.DEFAULT); 540 541 private final Quote quote; 542 543 /** 544 * Create a new line-reader with the given {@code quote} character, 545 * which is used in the CSV file which is read. 
546 * 547 * @param quote the quoting character 548 * @throws NullPointerException if the {@code quote} character is 549 * {@code null} 550 */ 551 public LineReader(final Quote quote) { 552 this.quote = requireNonNull(quote); 553 } 554 555 /** 556 * Create a new line reader with default quote character {@code '"'} 557 * ({@link Quote#DEFAULT}). 558 */ 559 public LineReader() { 560 this(Quote.DEFAULT); 561 } 562 563 /** 564 * Reads all CSV lines from the given {@code reader}. 565 * 566 * @apiNote 567 * This method must be used within a try-with-resources statement or 568 * similar control structure to ensure that the stream's open file is 569 * closed promptly after the stream's operations have completed. 570 * 571 * @param reader the reader from which to read the CSV content 572 * @return the CSV lines from the file as a {@code Stream} 573 */ 574 public Stream<String> read(final Reader reader) { 575 requireNonNull(reader); 576 577 final var result = new IOValue<>(resources -> { 578 final var br = reader instanceof BufferedReader r 579 ? resources.use(r) 580 : resources.use(new BufferedReader(reader)); 581 582 final var line = new StringBuilder(); 583 final Supplier<String> nextLine = () -> { 584 try { 585 return nextLine(br, line) ? line.toString() : null; 586 } catch (IOException e) { 587 throw new UncheckedIOException(e); 588 } 589 }; 590 591 return Stream.generate(nextLine) 592 .takeWhile(Objects::nonNull); 593 }); 594 595 return result.get().onClose(() -> 596 result.release(UncheckedIOException::new) 597 ); 598 } 599 600 private boolean nextLine(final Reader reader, final StringBuilder line) 601 throws IOException 602 { 603 line.setLength(0); 604 605 boolean quoted = false; 606 boolean escaped = false; 607 boolean eol = false; 608 609 int next = -2; 610 int i = 0; 611 612 while (next >= 0 || (i = reader.read()) != -1) { 613 final char current = next != -2 ? 
(char)next : (char)i; 614 next = -2; 615 616 if (isLineBreak(current)) { 617 if (quoted) { 618 line.append(current); 619 } else { 620 eol = true; 621 } 622 } else if (current == quote.value) { 623 if (quoted) { 624 if (!escaped && (next = reader.read()) == quote.value) { 625 escaped = true; 626 } else { 627 if (escaped) { 628 escaped = false; 629 } else { 630 quoted = false; 631 } 632 } 633 } else { 634 quoted = true; 635 } 636 line.append(current); 637 } else { 638 line.append(current); 639 } 640 641 if (eol) { 642 eol = false; 643 if (!line.isEmpty()) { 644 return true; 645 } 646 } 647 } 648 649 return !line.isEmpty(); 650 } 651 } 652 653 /** 654 * Splitting a CSV line into columns (records). 655 * <h2>Examples</h2> 656 * <b>Simple usage</b> 657 * {@snippet class="Snippets" region="LineSplitterSnippets.simpleSplit"} 658 * 659 * <b>Projecting and re-ordering columns</b> 660 * {@snippet class="Snippets" region="LineSplitterSnippets.projectingSplit"} 661 * 662 * @apiNote 663 * A line splitter ist <b>not</b> thread-safe and can't be shared between 664 * different threads. 665 * 666 * @version 8.1 667 * @since 8.1 668 */ 669 public static final class LineSplitter { 670 671 private final ColumnList columns; 672 private final Separator separator; 673 private final Quote quote; 674 675 /** 676 * Create a new line splitter with the given parameters. 677 * 678 * @param separator the separator character used by the CSV line to split 679 * @param quote the quote character used by the CSV line to split 680 * @param projection the column indexes which should be part of the split 681 * result 682 * @throws NullPointerException if one of the parameters is {@code null} 683 */ 684 public LineSplitter( 685 final Separator separator, 686 final Quote quote, 687 final ColumnIndexes projection 688 ) { 689 if (separator.value == quote.value) { 690 throw new IllegalArgumentException( 691 "Separator and quote char must be different: %s == %s." 
692 .formatted(separator.value, quote.value) 693 ); 694 } 695 696 this.columns = new ColumnList(projection); 697 this.separator = separator; 698 this.quote = quote; 699 } 700 701 /** 702 * Create a new line splitter with the given parameters. 703 * 704 * @param separator the separator character used by the CSV line to split 705 * @param quote the quote character used by the CSV line to split 706 * @throws NullPointerException if one of the parameters is {@code null} 707 */ 708 public LineSplitter(final Separator separator, final Quote quote) { 709 this(separator, quote, ColumnIndexes.ALL); 710 } 711 712 /** 713 * Create a new line splitter with the given parameters. The default 714 * quote character, {@link Quote#DEFAULT}, will be used by the created 715 * splitter. 716 * 717 * @param separator the separator character used by the CSV line to split 718 * @throws NullPointerException if one of the parameters is {@code null} 719 */ 720 public LineSplitter(final Separator separator) { 721 this(separator, Quote.DEFAULT, ColumnIndexes.ALL); 722 } 723 724 /** 725 * Create a new line splitter with the given parameters. The default 726 * separator character, {@link Separator#DEFAULT}, will be used by the 727 * created splitter. 728 * 729 * @param quote the quote character used by the CSV line to split 730 * @throws NullPointerException if one of the parameters is {@code null} 731 */ 732 public LineSplitter(final Quote quote) { 733 this(Separator.DEFAULT, quote, ColumnIndexes.ALL); 734 } 735 736 /** 737 * Create a new line splitter with the given parameters. Only the defined 738 * columns will be part of the split result and the default separator 739 * character, {@link Separator#DEFAULT}, and default quote character, 740 * {@link Quote#DEFAULT}, is used by the created splitter. 
741 * 742 * @param projection the column indexes which should be part of the split 743 * result 744 * @throws NullPointerException if one of the parameters is {@code null} 745 */ 746 public LineSplitter(final ColumnIndexes projection) { 747 this(Separator.DEFAULT, Quote.DEFAULT, projection); 748 } 749 750 /** 751 * Create a new line splitter with default values. 752 */ 753 public LineSplitter() { 754 this(Separator.DEFAULT, Quote.DEFAULT, ColumnIndexes.ALL); 755 } 756 757 /** 758 * Splitting the given CSV {@code line} into its columns. 759 * 760 * @param line the CSV line to split 761 * @return the split CSV columns 762 * @throws NullPointerException if the CSV {@code line} is {@code null} 763 */ 764 public String[] split(final CharSequence line) { 765 columns.clear(); 766 final StringBuilder column = new StringBuilder(); 767 768 boolean quoted = false; 769 boolean escaped = false; 770 boolean full = false; 771 772 int quoteIndex = 0; 773 774 for (int i = 0, n = line.length(); i < n && !full; ++i) { 775 final int previous = i > 0 ? line.charAt(i - 1) : -1; 776 final char current = line.charAt(i); 777 final int next = i + 1 < line.length() ? line.charAt(i + 1) : -1; 778 779 if (current == quote.value) { 780 if (quoted) { 781 if (!escaped && quote.value == next) { 782 escaped = true; 783 } else { 784 if (escaped) { 785 column.append(quote.value); 786 escaped = false; 787 } else { 788 if (next != -1 && separator.value != next) { 789 throw new IllegalArgumentException(""" 790 Only separator character, '%s', allowed \ 791 after quote, but found '%c'. 792 %s 793 """.formatted( 794 separator.value, 795 next, 796 toErrorDesc(line, i + 1) 797 ) 798 ); 799 } 800 801 add(column); 802 full = columns.isFull(); 803 quoted = false; 804 } 805 } 806 } else { 807 if (previous != -1 && separator.value != previous) { 808 throw new IllegalArgumentException(""" 809 Only separator character, '%s', allowed before \ 810 quote, but found '%c'. 
811 %s 812 """.formatted( 813 separator.value, 814 previous, 815 toErrorDesc(line, Math.max(i - 1, 0)) 816 ) 817 ); 818 } 819 quoted = true; 820 quoteIndex = i; 821 } 822 } else if (current == separator.value) { 823 if (quoted) { 824 column.append(current); 825 } else if (separator.value == previous || previous == -1) { 826 add(column); 827 full = columns.isFull(); 828 } 829 } else { 830 int j = i; 831 832 // Read till the next token separator. 833 char c; 834 while (j < n && !isTokenSeparator(c = line.charAt(j))) { 835 column.append(c); 836 ++j; 837 } 838 if (j != i) { 839 i = j - 1; 840 } 841 if (!quoted) { 842 add(column); 843 full = columns.isFull(); 844 } 845 } 846 } 847 848 if (quoted) { 849 throw new IllegalArgumentException(""" 850 Unbalanced quote character. 851 %s 852 """.formatted(toErrorDesc(line, quoteIndex)) 853 ); 854 } 855 if (line.isEmpty() || 856 separator.value == line.charAt(line.length() - 1)) 857 { 858 add(column); 859 } 860 861 return columns.toArray(); 862 } 863 864 private void add(final StringBuilder column) { 865 columns.add(column.toString()); 866 column.setLength(0); 867 } 868 869 private boolean isTokenSeparator(final char c) { 870 return c == separator.value || c == quote.value; 871 } 872 873 private static String toErrorDesc(final CharSequence line, final int pos) { 874 return """ 875 %s 876 %s 877 """.formatted( 878 line.toString().stripTrailing(), 879 " ".repeat(pos) + "^" 880 ); 881 } 882 } 883 884 885 /** 886 * Column collection, which is backed up by a string list. 887 */ 888 private static final class ColumnList { 889 private final List<String> columns = new ArrayList<>(); 890 private final ColumnIndexes projection; 891 892 private int index = 0; 893 private int count = 0; 894 895 ColumnList(final ColumnIndexes projection) { 896 this.projection = requireNonNull(projection); 897 } 898 899 /** 900 * Appends a {@code column} to the column collection. 
901 * 902 * @param column the column to add 903 */ 904 void add(String column) { 905 if (!isFull()) { 906 count += set(column, index++); 907 } 908 } 909 910 private int set(String element, int column) { 911 int updated = 0; 912 913 if (projection.values.length == 0) { 914 columns.add(element); 915 ++updated; 916 } else { 917 int pos = -1; 918 while ((pos = indexOf(projection.values, pos + 1, column)) != -1) { 919 for (int i = columns.size(); i <= pos; ++i) { 920 columns.add(null); 921 } 922 columns.set(pos, element); 923 ++updated; 924 } 925 } 926 927 return updated; 928 } 929 930 private static int indexOf(int[] array, int start, int value) { 931 for (int i = start; i < array.length; ++i) { 932 if (array[i] == value) { 933 return i; 934 } 935 } 936 937 return -1; 938 } 939 940 /** 941 * Checks whether another column can be added. 942 * 943 * @return {@code true} if another column can be added to this 944 * collection, {@code false} otherwise 945 */ 946 boolean isFull() { 947 return 948 projection.values.length > 0 && 949 projection.values.length <= count; 950 } 951 952 /** 953 * Removes all columns. 954 */ 955 public void clear() { 956 columns.clear(); 957 index = 0; 958 count = 0; 959 } 960 961 String[] toArray() { 962 for (int i = columns.size(); i < projection.values.length; ++i) { 963 columns.add(null); 964 } 965 return columns.toArray(String[]::new); 966 } 967 968 } 969 970 /** 971 * This class joins an array of columns into one CSV line. 972 * 973 * <h2>Examples</h2> 974 * <b>Simple usage</b> 975 * {@snippet class="Snippets" region="ColumnJoinerSnippets.simpleJoin"} 976 * 977 * <b>Embedding and re-ordering data</b> 978 * {@snippet class="Snippets" region="ColumnJoinerSnippets.embedToCsv"} 979 * 980 * @apiNote 981 * The column joiner is <em>thread-safe</em> and can be shared between 982 * different threads. 
*
* @version 8.1
* @since 8.1
*/
    public static final class ColumnJoiner {

        /**
         * Default column joiner, which is using default separator character,
         * {@link Separator#DEFAULT}, and default quote character,
         * {@link Quote#DEFAULT}.
         */
        public static final ColumnJoiner DEFAULT = new ColumnJoiner(
            Separator.DEFAULT,
            Quote.DEFAULT,
            ColumnIndexes.ALL
        );

        /**
         * The CSV column joiner parameters.
         *
         * @param separator the column separator char
         * @param quote the quote char
         * @param embedding the target indexes of the joined columns: the
         *        i<sup>th</sup> input column is written to position
         *        {@code embedding[i]} of the joined row, and is dropped if
         *        that index is negative. If empty, all given columns are
         *        joined in their given order.
         */
        private record Param(char separator, char quote, int... embedding) {

            /**
             * Converts the given column value into its CSV representation.
             * A {@code null} value becomes the empty string. Quote characters
             * within the value are doubled, and the value is enclosed in
             * quote characters if it contained a quote character, the
             * separator character or a line break character.
             *
             * @param value the column value, may be {@code null}
             * @return the, possibly quoted, column string
             */
            private String escape(Object value) {
                final var quoteStr = String.valueOf(quote);

                if (value == null) {
                    return "";
                } else {
                    var stringValue = value.toString();
                    var string = stringValue.replace(quoteStr, quoteStr + quoteStr);

                    // A changed length means that at least one quote character
                    // has been doubled, which also requires quoting the value.
                    if (stringValue.length() != string.length() || mustEscape(string)) {
                        return quoteStr + string + quoteStr;
                    } else {
                        return stringValue;
                    }
                }
            }

            /**
             * Tests whether the given value contains the separator character
             * or a line break character, as determined by {@code isLineBreak}.
             *
             * @param value the value to test
             * @return {@code true} if the value must be quoted, {@code false}
             *         otherwise
             */
            private boolean mustEscape(CharSequence value) {
                for (int i = 0; i < value.length(); ++i) {
                    final char c = value.charAt(i);
                    if (c == separator || isLineBreak(c)) {
                        return true;
                    }
                }
                return false;
            }
        }

        private final Param param;

        // Length of a joined row when an embedding is used: the greatest
        // embedding index plus one, or zero if there is no non-negative index.
        private final int columnCount;

        /**
         * Create a new column joiner with the given parameters.
         *
         * @param separator the CSV separator character used by the joiner
         * @param quote the CSV quote character used by the joiner
         * @param embedding the column indexes to join
         * @throws NullPointerException if one of the parameters is {@code null}
         * @throws IllegalArgumentException if the {@code separator} and the
         *         {@code quote} character are the same
         */
        public ColumnJoiner(
            final Separator separator,
            final Quote quote,
            final ColumnIndexes embedding
        ) {
            if (separator.value == quote.value) {
                throw new IllegalArgumentException(
                    "Separator and quote char must be different: %s == %s."
                        .formatted(separator.value, quote.value)
                );
            }

            param = new Param(separator.value, quote.value, embedding.values);
            columnCount = Math.max(max(param.embedding) + 1, 0);
        }

        /**
         * Create a new column joiner with the given parameters.
         *
         * @param separator the CSV separator character used by the joiner
         * @param quote the CSV quote character used by the joiner
         * @throws NullPointerException if one of the parameters is {@code null}
         * @throws IllegalArgumentException if the {@code separator} and the
         *         {@code quote} character are the same
         */
        public ColumnJoiner(final Separator separator, final Quote quote) {
            this(separator, quote, ColumnIndexes.ALL);
        }

        /**
         * Create a new column joiner with the given parameters.
         *
         * @param separator the CSV separator character used by the joiner
         * @throws NullPointerException if one of the parameters is {@code null}
         * @throws IllegalArgumentException if the {@code separator} character
         *         is the same as the default quote character
         */
        public ColumnJoiner(final Separator separator) {
            this(separator, Quote.DEFAULT, ColumnIndexes.ALL);
        }

        /**
         * Create a new column joiner with the given parameters.
         *
         * @param separator the CSV separator character used by the joiner
         * @param embedding the column indexes to join
         * @throws NullPointerException if one of the parameters is {@code null}
         * @throws IllegalArgumentException if the {@code separator} character
         *         is the same as the default quote character
         */
        public ColumnJoiner(final Separator separator, final ColumnIndexes embedding) {
            this(separator, Quote.DEFAULT, embedding);
        }

        /**
         * Create a new column joiner with the given parameters.
         *
         * @param quote the CSV quote character used by the joiner
         * @throws NullPointerException if one of the parameters is {@code null}
         * @throws IllegalArgumentException if the {@code quote} character is
         *         the same as the default separator character
         */
        public ColumnJoiner(final Quote quote) {
            this(Separator.DEFAULT, quote, ColumnIndexes.ALL);
        }

        /**
         * Create a new column joiner with the given <em>embedding</em> column
         * indexes.
         *
         * @param embedding the embedding column indexes
         * @throws NullPointerException if the {@code embedding} is {@code null}
         */
        public ColumnJoiner(final ColumnIndexes embedding) {
            this(Separator.DEFAULT, Quote.DEFAULT, embedding);
        }

        /**
         * Create a new column joiner with the given parameters.
         *
         * @param quote the CSV quote character used by the joiner
         * @param embedding the column indexes to join
         * @throws NullPointerException if one of the parameters is {@code null}
         * @throws IllegalArgumentException if the {@code quote} character is
         *         the same as the default separator character
         */
        public ColumnJoiner(final Quote quote, final ColumnIndexes embedding) {
            this(Separator.DEFAULT, quote, embedding);
        }

        /**
         * Return the greatest value of the given array, or
         * {@link Integer#MIN_VALUE} if the array is empty.
         */
        private static int max(int[] array) {
            int max = Integer.MIN_VALUE;
            for (int value : array) {
                if (value > max) {
                    max = value;
                }
            }
            return max;
        }

        /**
         * Joins the given CSV {@code columns}, using the given separator and
         * quote character. If the joiner has been created with embedding
         * column indexes, the columns are placed at these indexes of the
         * joined row. Row positions without an assigned column are written as
         * empty values, and columns with a negative embedding index, or
         * without an embedding index at all, are skipped.
         *
         * @param columns the CSV columns to join
         * @return the joined CSV columns
         * @throws NullPointerException if the given {@code columns} are
         *         {@code null}
         */
        public String join(final Iterable<?> columns) {
            if (param.embedding.length == 0) {
                return join0(columns);
            } else {
                // Unassigned positions stay null and are joined as empty string.
                final var values = new Object[columnCount];
                final var it = columns.iterator();
                int i = 0;
                while (it.hasNext() && i < param.embedding.length) {
                    final var col = it.next();
                    final var index = param.embedding[i++];
                    if (index >= 0) {
                        values[index] = col;
                    }
                }

                return join0(Arrays.asList(values));
            }
        }

        /**
         * Escapes the given columns and concatenates them, separated by the
         * separator character.
         */
        private String join0(final Iterable<?> cols) {
            final var row = new StringBuilder();
            final var it = cols.iterator();
            while (it.hasNext()) {
                final var column = it.next();
                row.append(param.escape(column));
                if (it.hasNext()) {
                    row.append(param.separator);
                }
            }

            return row.toString();
        }

        /**
         * Joins the given CSV {@code columns}, using the given separator and
         * quote character.
         *
         * @param columns the CSV columns to join
         * @return the joined CSV columns
         * @throws NullPointerException if the given {@code columns} are
         *         {@code null}
         */
        public String join(final Object[] columns) {
            return join(Arrays.asList(columns));
        }
    }

    /**
     * Simple and fast char-sequence reader. The reader holds no resources,
     * therefore closing it has no effect.
     */
    static final class CharSeqReader extends Reader {
        private static final int EOF = -1;

        private final CharSequence seq;

        // Index of the next character to read.
        private int idx;

        CharSeqReader(final CharSequence seq) {
            this.seq = requireNonNull(seq);
        }

        /**
         * Reads a single character.
         *
         * @return the character read, or {@code -1} if the end of the
         *         sequence has been reached
         */
        @Override
        public int read() {
            return idx >= seq.length() ? EOF : seq.charAt(idx++);
        }

        /**
         * Reads up to {@code length} characters into the given buffer,
         * starting at the given {@code offset}.
         *
         * @param cbuf the destination buffer
         * @param offset the buffer index of the first character to store
         * @param length the maximal number of characters to read
         * @return the number of characters read, {@code 0} if {@code length}
         *         is zero, or {@code -1} if the end of the sequence has been
         *         reached
         * @throws NullPointerException if {@code cbuf} is {@code null}
         * @throws IndexOutOfBoundsException if {@code offset} or
         *         {@code length} is negative, or {@code length} is greater
         *         than {@code cbuf.length - offset}
         */
        @Override
        public int read(final char[] cbuf, final int offset, final int length) {
            requireNonNull(cbuf);

            // Validate the arguments first, independently of the read position.
            // 'cbuf.length - offset' can't overflow, unlike 'offset + length'.
            if (offset < 0 || length < 0 || length > cbuf.length - offset) {
                throw new IndexOutOfBoundsException(
                    "Buffer size=%d, offset=%d, length=%d."
                        .formatted(cbuf.length, offset, length)
                );
            }

            // The Reader contract requires 0 for an empty request, even at EOF.
            if (length == 0) {
                return 0;
            }
            if (idx >= seq.length()) {
                return EOF;
            }

            final int count = Math.min(length, seq.length() - idx);
            for (int i = 0; i < count; ++i) {
                cbuf[offset + i] = seq.charAt(idx++);
            }

            return count;
        }

        /**
         * Does nothing, since there are no resources to release.
         */
        @Override
        public void close() {
        }
    }

}