/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.csv;

import static org.apache.commons.csv.Token.Type.TOKEN;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.TreeMap;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

/**
 * Parses CSV files according to the specified format.
 *
 * Because CSV appears in many different dialects, the parser supports many formats by allowing the
 * specification of a {@link CSVFormat}.
 *
 * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream.
 *
 * <h2>Creating instances</h2>
 * <p>
 * There are several static factory methods that can be used to create instances for various types of resources:
 * </p>
 * <ul>
 *     <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
 *     <li>{@link #parse(String, CSVFormat)}</li>
 *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
 * </ul>
 * <p>
 * Alternatively parsers can also be created by passing a {@link Reader} directly to one of the constructors.
 *
 * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
 * </p>
 * <pre>
 * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) {
 *     ...
 * }
 * </pre>
 *
 * <h2>Parsing record wise</h2>
 * <p>
 * To parse a CSV input from a file, you write:
 * </p>
 *
 * <pre>
 * File csvData = new File("/path/to/csv");
 * CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.RFC4180);
 * for (CSVRecord csvRecord : parser) {
 *     ...
 * }
 * </pre>
 *
 * <p>
 * This will read and parse the contents of the file using the
 * <a href="http://tools.ietf.org/html/rfc4180" target="_blank">RFC 4180</a> format.
 * </p>
 *
 * <p>
 * To parse CSV input in a format like Excel, you write:
 * </p>
 *
 * <pre>
 * CSVParser parser = CSVParser.parse(csvData, StandardCharsets.UTF_8, CSVFormat.EXCEL);
 * for (CSVRecord csvRecord : parser) {
 *     ...
 * }
 * </pre>
 *
 * <p>
 * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
 * customising CSVFormats is available in {@link CSVFormat CSVFormat Javadoc}.
 * </p>
 *
 * <h2>Parsing into memory</h2>
 * <p>
 * If parsing record wise is not desired, the contents of the input can be read completely into memory.
 * </p>
 *
 * <pre>
 * Reader in = new StringReader("a;b\nc;d");
 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
 * List&lt;CSVRecord&gt; list = parser.getRecords();
 * </pre>
 *
 * <p>
 * There are two constraints that have to be kept in mind:
 * </p>
 *
 * <ol>
 *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
 *     the input, those records will not end up in the in memory representation of your CSV data.</li>
 *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example if you're
 *     parsing a 150MB file of CSV data the contents will be read completely into memory.</li>
 * </ol>
 *
 * <h2>Notes</h2>
 * <p>
 * Internal parser state is completely covered by the format and the reader-state.
 * </p>
 *
 * @see <a href="package-summary.html">package documentation for more details</a>
 */
public final class CSVParser implements Iterable<CSVRecord>, Closeable {

    /**
     * Iterator over the records of the enclosing parser. A record fetched by {@link #hasNext()} is buffered in
     * {@code current} until {@link #next()} hands it out.
     */
    class CSVRecordIterator implements Iterator<CSVRecord> {
        /** Record read ahead by {@link #hasNext()}, or {@code null} when nothing is buffered. */
        private CSVRecord current;

        /**
         * Reads the next record from the enclosing parser, converting an {@link IOException} into an
         * {@link IllegalStateException} because {@link Iterator} methods cannot throw checked exceptions.
         */
        private CSVRecord getNextRecord() {
            try {
                return CSVParser.this.nextRecord();
            } catch (final IOException e) {
                throw new IllegalStateException(
                        e.getClass().getSimpleName() + " reading next record: " + e.toString(), e);
            }
        }

        @Override
        public boolean hasNext() {
            if (CSVParser.this.isClosed()) {
                return false;
            }
            if (this.current == null) {
                this.current = this.getNextRecord();
            }

            return this.current != null;
        }

        @Override
        public CSVRecord next() {
            if (CSVParser.this.isClosed()) {
                throw new NoSuchElementException("CSVParser has been closed");
            }
            CSVRecord next = this.current;
            this.current = null;

            if (next == null) {
                // hasNext() wasn't called before
                next = this.getNextRecord();
                if (next == null) {
                    throw new NoSuchElementException("No more CSV records available");
                }
            }

            return next;
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * Header information based on name and position.
     */
    private static final class Headers {
        /**
         * Header column positions (0-based); {@code null} when the format defines no header.
         */
        final Map<String, Integer> headerMap;

        /**
         * Header names in column order
         */
        final List<String> headerNames;

        Headers(final Map<String, Integer> headerMap, final List<String> headerNames) {
            this.headerMap = headerMap;
            this.headerNames = headerNames;
        }
    }

    /**
     * Closes a resource that was opened internally after parser creation failed, recording any failure to close
     * as a suppressed exception so the original failure is the one that propagates.
     *
     * @param resource
     *            the resource to close.
     * @param failure
     *            the exception that is about to be rethrown.
     */
    private static void closeAfterFailure(final Closeable resource, final Throwable failure) {
        try {
            resource.close();
        } catch (final IOException closeFailure) {
            failure.addSuppressed(closeFailure);
        }
    }

    /**
     * Creates a parser for the given {@link File}.
     *
     * @param file
     *            a CSV file. Must not be null.
     * @param charset
     *            The Charset to decode the given file.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new parser
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either file or format are null.
     * @throws IOException
     *             If an I/O error occurs
     */
    public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
        Objects.requireNonNull(file, "file");
        return parse(file.toPath(), charset, format);
    }

    /**
     * Creates a CSV parser using the given {@link CSVFormat}.
     *
     * <p>
     * If you do not read all records from the given {@code inputStream}, you should call {@link #close()} on the
     * parser, unless you close the {@code inputStream}.
     * </p>
     *
     * @param inputStream
     *            an InputStream containing CSV-formatted input. Must not be null.
     * @param charset
     *            The Charset to decode the given stream.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new CSVParser configured with the given reader and format.
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either reader or format are null.
     * @throws IOException
     *             If there is a problem reading the header or skipping the first record
     * @since 1.5
     */
    @SuppressWarnings("resource")
    public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format)
            throws IOException {
        Objects.requireNonNull(inputStream, "inputStream");
        Objects.requireNonNull(format, "format");
        return parse(new InputStreamReader(inputStream, charset), format);
    }

    /**
     * Creates and returns a parser for the given {@link Path}, which the caller MUST close.
     *
     * <p>
     * If creating the parser fails, the stream opened on {@code path} is closed before the exception propagates.
     * </p>
     *
     * @param path
     *            a CSV file. Must not be null.
     * @param charset
     *            The Charset to decode the given file.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new parser
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either file or format are null.
     * @throws IOException
     *             If an I/O error occurs
     * @since 1.5
     */
    @SuppressWarnings("resource")
    public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException {
        Objects.requireNonNull(path, "path");
        Objects.requireNonNull(format, "format");
        final InputStream inputStream = Files.newInputStream(path);
        try {
            return parse(inputStream, charset, format);
        } catch (final IOException | RuntimeException e) {
            // The caller never receives a handle to the stream on failure, so release it here.
            closeAfterFailure(inputStream, e);
            throw e;
        }
    }

    /**
     * Creates a CSV parser using the given {@link CSVFormat}
     *
     * <p>
     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
     * unless you close the {@code reader}.
     * </p>
     *
     * @param reader
     *            a Reader containing CSV-formatted input. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new CSVParser configured with the given reader and format.
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either reader or format are null.
     * @throws IOException
     *             If there is a problem reading the header or skipping the first record
     * @since 1.5
     */
    public static CSVParser parse(final Reader reader, final CSVFormat format) throws IOException {
        return new CSVParser(reader, format);
    }

    /**
     * Creates a parser for the given {@link String}.
     *
     * @param string
     *            a CSV string. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new parser
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either string or format are null.
     * @throws IOException
     *             If an I/O error occurs
     */
    public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
        Objects.requireNonNull(string, "string");
        Objects.requireNonNull(format, "format");

        return new CSVParser(new StringReader(string), format);
    }

    /**
     * Creates and returns a parser for the given URL, which the caller MUST close.
     *
     * <p>
     * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser.
     * If creating the parser fails, the stream opened on {@code url} is closed before the exception propagates.
     * </p>
     *
     * @param url
     *            a URL. Must not be null.
     * @param charset
     *            the charset for the resource. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new parser
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either url, charset or format are null.
     * @throws IOException
     *             If an I/O error occurs
     */
    @SuppressWarnings("resource")
    public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
        Objects.requireNonNull(url, "url");
        Objects.requireNonNull(charset, "charset");
        Objects.requireNonNull(format, "format");

        final InputStream inputStream = url.openStream();
        try {
            return new CSVParser(new InputStreamReader(inputStream, charset), format);
        } catch (final IOException | RuntimeException e) {
            // The caller never receives a handle to the stream on failure, so release it here.
            closeAfterFailure(inputStream, e);
            throw e;
        }
    }

    private final CSVFormat format;

    private final Headers headers;

    private final Lexer lexer;

    private final CSVRecordIterator csvRecordIterator;

    /** A record buffer for getRecord(). Grows as necessary and is reused. */
    private final List<String> recordList = new ArrayList<>();

    /**
     * The number assigned to the most recently created record; incremented before each new record is created.
     */
    private long recordNumber;

    /**
     * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
     * with {@link #recordNumber}.
     */
    private final long characterOffset;

    /** Token instance that is reset and refilled for every lexer call, shared to reduce garbage. */
    private final Token reusableToken = new Token();

    /**
     * Customized CSV parser using the given {@link CSVFormat}
     *
     * <p>
     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
     * unless you close the {@code reader}.
     * </p>
     *
     * @param reader
     *            a Reader containing CSV-formatted input. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either reader or format are null.
     * @throws IOException
     *             If there is a problem reading the header or skipping the first record
     */
    public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
        this(reader, format, 0, 1);
    }

    /**
     * Customized CSV parser using the given {@link CSVFormat}
     *
     * <p>
     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
     * unless you close the {@code reader}.
     * </p>
     *
     * @param reader
     *            a Reader containing CSV-formatted input. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @param characterOffset
     *            Lexer offset when the parser does not start parsing at the beginning of the source.
     * @param recordNumber
     *            The next record number to assign
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either reader or format are null.
     * @throws IOException
     *             If there is a problem reading the header or skipping the first record
     * @since 1.1
     */
    @SuppressWarnings("resource")
    public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
            throws IOException {
        Objects.requireNonNull(reader, "reader");
        Objects.requireNonNull(format, "format");

        this.format = format.copy();
        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
        this.csvRecordIterator = new CSVRecordIterator();
        // Assigned before createHeaders() because that method calls nextRecord(), which reads this field.
        this.characterOffset = characterOffset;
        this.headers = createHeaders();
        // Assigned after createHeaders() so a consumed header record does not count towards the record number.
        this.recordNumber = recordNumber - 1;
    }

    /**
     * Appends the content of the current token to the record buffer, applying trimming and null handling.
     *
     * @param lastRecord
     *            whether the token is the last value of the record; an empty last value is dropped when the format
     *            uses a trailing delimiter.
     */
    private void addRecordValue(final boolean lastRecord) {
        final String input = this.reusableToken.content.toString();
        final String inputClean = this.format.getTrim() ? input.trim() : input;
        if (lastRecord && inputClean.isEmpty() && this.format.getTrailingDelimiter()) {
            return;
        }
        this.recordList.add(handleNull(inputClean));
    }

    /**
     * Closes resources.
     *
     * @throws IOException
     *             If an I/O error occurs
     */
    @Override
    public void close() throws IOException {
        // lexer is final and assigned in every constructor, so no null check is needed.
        this.lexer.close();
    }

    /**
     * Creates an empty map for header names: case-insensitive when the format ignores header case, otherwise
     * insertion-ordered.
     */
    private Map<String, Integer> createEmptyHeaderMap() {
        return this.format.getIgnoreHeaderCase() ?
                new TreeMap<>(String.CASE_INSENSITIVE_ORDER) :
                new LinkedHashMap<>();
    }

    /**
     * Creates the name to index mapping if the format defines a header.
     *
     * @return the header information; its map is {@code null} and its name list is empty if the format has no
     *         header.
     * @throws IOException if there is a problem reading the header or skipping the first record
     */
    private Headers createHeaders() throws IOException {
        Map<String, Integer> hdrMap = null;
        List<String> headerNames = null;
        final String[] formatHeader = this.format.getHeader();
        if (formatHeader != null) {
            hdrMap = createEmptyHeaderMap();
            String[] headerRecord = null;
            if (formatHeader.length == 0) {
                // read the header from the first line of the file
                final CSVRecord nextRecord = this.nextRecord();
                if (nextRecord != null) {
                    headerRecord = nextRecord.values();
                }
            } else {
                if (this.format.getSkipHeaderRecord()) {
                    this.nextRecord();
                }
                headerRecord = formatHeader;
            }

            // build the name to index mappings
            if (headerRecord != null) {
                for (int i = 0; i < headerRecord.length; i++) {
                    final String header = headerRecord[i];
                    final boolean emptyHeader = header == null || header.trim().isEmpty();
                    if (emptyHeader && !this.format.getAllowMissingColumnNames()) {
                        throw new IllegalArgumentException(
                            "A header name is missing in " + Arrays.toString(headerRecord));
                    }
                    // Note: This will always allow a duplicate header if the header is empty
                    final boolean containsHeader = header != null && hdrMap.containsKey(header);
                    if (containsHeader && !emptyHeader && !this.format.getAllowDuplicateHeaderNames()) {
                        throw new IllegalArgumentException(
                            String.format(
                                "The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.withAllowDuplicateHeaderNames().",
                                header, Arrays.toString(headerRecord)));
                    }
                    if (header != null) {
                        hdrMap.put(header, Integer.valueOf(i));
                        if (headerNames == null) {
                            headerNames = new ArrayList<>(headerRecord.length);
                        }
                        headerNames.add(header);
                    }
                }
            }
        }
        if (headerNames == null) {
            headerNames = Collections.emptyList(); //immutable
        } else {
            headerNames = Collections.unmodifiableList(headerNames);
        }
        return new Headers(hdrMap, headerNames);
    }

    /**
     * Returns the current line number in the input stream.
     *
     * <p>
     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
     * the record number.
     * </p>
     *
     * @return current line number
     */
    public long getCurrentLineNumber() {
        return this.lexer.getCurrentLineNumber();
    }

    /**
     * Gets the first end-of-line string encountered.
     *
     * @return the first end-of-line string
     * @since 1.5
     */
    public String getFirstEndOfLine() {
        return lexer.getFirstEol();
    }

    /**
     * Returns a copy of the header map.
     * <p>
     * The map keys are column names. The map values are 0-based indices.
     * </p>
     * <p>
     * Note: The map can only provide a one-to-one mapping when the format did not
     * contain null or duplicate column names.
     * </p>
     *
     * @return a copy of the header map, or {@code null} if the format defines no header.
     */
    public Map<String, Integer> getHeaderMap() {
        if (this.headers.headerMap == null) {
            return null;
        }
        final Map<String, Integer> map = createEmptyHeaderMap();
        map.putAll(this.headers.headerMap);
        return map;
    }

    /**
     * Returns the header map.
     *
     * @return the header map.
     */
    Map<String, Integer> getHeaderMapRaw() {
        return this.headers.headerMap;
    }

    /**
     * Returns a read-only list of header names that iterates in column order.
     * <p>
     * Note: The list provides strings that can be used as keys in the header map.
     * The list will not contain null column names if they were present in the input
     * format.
     * </p>
     *
     * @return read-only list of header names that iterates in column order.
     * @see #getHeaderMap()
     * @since 1.7
     */
    public List<String> getHeaderNames() {
        // createHeaders() already stores an immutable or unmodifiable list, so no further wrapping is needed.
        return headers.headerNames;
    }

    /**
     * Returns the current record number in the input stream.
     *
     * <p>
     * <strong>ATTENTION:</strong> If your CSV input has multi-line values, the returned number does not correspond to
     * the line number.
     * </p>
     *
     * @return current record number
     */
    public long getRecordNumber() {
        return this.recordNumber;
    }

    /**
     * Parses the CSV input according to the given format and returns the content as a list of
     * {@link CSVRecord CSVRecords}.
     *
     * <p>
     * The returned content starts at the current parse-position in the stream.
     * </p>
     *
     * @return list of {@link CSVRecord CSVRecords}, may be empty
     * @throws IOException
     *             on parse error or input read-failure
     */
    public List<CSVRecord> getRecords() throws IOException {
        CSVRecord rec;
        final List<CSVRecord> records = new ArrayList<>();
        while ((rec = this.nextRecord()) != null) {
            records.add(rec);
        }
        return records;
    }

    /**
     * Handle whether input is parsed as null
     *
     * @param input
     *            the cell data to further processed
     * @return null if input is parsed as null, or input itself if input isn't parsed as null
     */
    private String handleNull(final String input) {
        final boolean isQuoted = this.reusableToken.isQuoted;
        final String nullString = format.getNullString();
        final boolean strictQuoteMode = isStrictQuoteMode();
        if (input.equals(nullString)) {
            // nullString = NULL(String), distinguish between "NULL" and NULL in ALL_NON_NULL or NON_NUMERIC quote mode
            return strictQuoteMode && isQuoted ? input : null;
        }
        // don't set nullString, distinguish between "" and ,, (absent values) in All_NON_NULL or NON_NUMERIC quote mode
        return strictQuoteMode && nullString == null && input.isEmpty() && !isQuoted ? null : input;
    }

    /**
     * Tests whether this parser is closed.
     *
     * @return whether this parser is closed.
     */
    public boolean isClosed() {
        return this.lexer.isClosed();
    }

    /**
     * Tests whether the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or {@link QuoteMode#NON_NUMERIC}.
     *
     * @return true if the format's {@link QuoteMode} is {@link QuoteMode#ALL_NON_NULL} or
     *         {@link QuoteMode#NON_NUMERIC}.
     */
    private boolean isStrictQuoteMode() {
        return this.format.getQuoteMode() == QuoteMode.ALL_NON_NULL ||
               this.format.getQuoteMode() == QuoteMode.NON_NUMERIC;
    }

    /**
     * Returns the record iterator.
     *
     * <p>
     * An {@link IOException} caught during the iteration is re-thrown as an
     * {@link IllegalStateException}.
     * </p>
     * <p>
     * If the parser is closed a call to {@link Iterator#next()} will throw a
     * {@link NoSuchElementException}.
     * </p>
     * <p>
     * Every call returns the same iterator instance, so iteration always continues from the parser's current
     * position.
     * </p>
     *
     * @return the record iterator of this parser.
     */
    @Override
    public Iterator<CSVRecord> iterator() {
        return csvRecordIterator;
    }

    /**
     * Parses the next record from the current point in the stream.
     *
     * @return the next record, or {@code null} if no values could be read because the end of the stream has been
     *         reached
     * @throws IOException
     *             on parse error or input read-failure
     */
    CSVRecord nextRecord() throws IOException {
        CSVRecord result = null;
        this.recordList.clear();
        StringBuilder sb = null;
        final long startCharPosition = lexer.getCharacterPosition() + this.characterOffset;
        do {
            this.reusableToken.reset();
            this.lexer.nextToken(this.reusableToken);
            switch (this.reusableToken.type) {
            case TOKEN:
                this.addRecordValue(false);
                break;
            case EORECORD:
                this.addRecordValue(true);
                break;
            case EOF:
                if (this.reusableToken.isReady) {
                    this.addRecordValue(true);
                }
                break;
            case INVALID:
                throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence");
            case COMMENT: // Collected and attached to the record that follows
                if (sb == null) { // first comment for this record
                    sb = new StringBuilder();
                } else {
                    sb.append(Constants.LF);
                }
                sb.append(this.reusableToken.content);
                this.reusableToken.type = TOKEN; // Read another token
                break;
            default:
                throw new IllegalStateException("Unexpected Token type: " + this.reusableToken.type);
            }
        } while (this.reusableToken.type == TOKEN);

        if (!this.recordList.isEmpty()) {
            this.recordNumber++;
            final String comment = sb == null ? null : sb.toString();
            result = new CSVRecord(this, this.recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
                this.recordNumber, startCharPosition);
        }
        return result;
    }

    /**
     * Returns a sequential {@code Stream} with this collection as its source.
     *
     * @return a sequential {@code Stream} with this collection as its source.
     * @since 1.9.0
     */
    public Stream<CSVRecord> stream() {
        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator(), Spliterator.ORDERED), false);
    }

}