// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
// This module is browser compatible.

import {
  convertRowToObject,
  createBareQuoteErrorMessage,
  createQuoteErrorMessage,
  type ParseResult,
  type ReadOptions,
  type RecordWithColumn,
} from "./_io.ts";
import { codePointLength } from "./_shared.ts";

export type { ParseResult, RecordWithColumn };

const BYTE_ORDER_MARK = "\ufeff";

/**
 * Line-oriented CSV parser working on an in-memory string.
 *
 * The parser keeps a cursor into the input and hands out one physical line at
 * a time; a record may span several physical lines when a quoted field
 * contains line breaks.
 */
class Parser {
  #input = "";
  #cursor = 0;
  #options: {
    separator: string;
    trimLeadingSpace: boolean;
    comment: string | undefined;
    lazyQuotes: boolean | undefined;
    fieldsPerRecord: number | undefined;
  };

  constructor({
    separator = ",",
    trimLeadingSpace = false,
    comment,
    lazyQuotes,
    fieldsPerRecord,
  }: ReadOptions = {}) {
    this.#options = {
      separator,
      trimLeadingSpace,
      comment,
      lazyQuotes,
      fieldsPerRecord,
    };
  }

  /**
   * Reads the next physical line and advances the cursor past its terminator.
   *
   * Lines are terminated by `\n` or `\r\n`; the terminator is not part of the
   * returned string. A single trailing `\r` on the last, unterminated line is
   * dropped as well.
   *
   * @returns The line content, or `null` when the cursor is at end of input.
   */
  #readLine(): string | null {
    if (this.#isEOF()) return null;

    const start = this.#cursor;
    const newlineIndex = this.#input.indexOf("\n", start);

    let line: string;
    if (newlineIndex === -1) {
      // Last line without a terminator: consume the rest of the input.
      line = this.#input.slice(start);
      this.#cursor = this.#input.length;
    } else {
      line = this.#input.slice(start, newlineIndex);
      this.#cursor = newlineIndex + 1;
    }

    // At most one `\r` is removed: it is either the first half of a `\r\n`
    // terminator or a stray carriage return at the very end of the input.
    return line.endsWith("\r") ? line.slice(0, -1) : line;
  }

  /** Returns `true` once the cursor has consumed the whole input. */
  #isEOF(): boolean {
    return this.#cursor >= this.#input.length;
  }

  /**
   * Parses one record starting at the current cursor position.
   *
   * @param zeroBasedStartLine Zero-based index passed through to the error
   * message helpers as the line on which the record starts.
   * @returns `null` at end of input, an empty array for a blank line or a
   * comment line, otherwise the fields of the record.
   * @throws {SyntaxError} If a quote is misplaced or a quoted field is not
   * terminated, and `lazyQuotes` is not enabled.
   */
  #parseRecord(zeroBasedStartLine: number): string[] | null {
    let fullLine = this.#readLine();

    if (fullLine === null) return null;
    if (fullLine.length === 0) {
      return [];
    }

    let zeroBasedLine = zeroBasedStartLine;

    // line starting with comment character is ignored
    if (this.#options.comment && fullLine[0] === this.#options.comment) {
      return [];
    }

    let line = fullLine;
    const quote = '"';
    const quoteLen = quote.length;
    const separatorLen = this.#options.separator.length;
    // All field values are appended to `recordBuffer`; `fieldIndexes` stores
    // the end offset of each field so the buffer can be sliced at the end.
    let recordBuffer = "";
    const fieldIndexes: number[] = [];
    parseField: while (true) {
      if (this.#options.trimLeadingSpace) {
        line = line.trimStart();
      }

      if (line.length === 0 || !line.startsWith(quote)) {
        // Non-quoted string field
        const i = line.indexOf(this.#options.separator);
        let field = line;
        if (i >= 0) {
          field = field.substring(0, i);
        }
        // Check to make sure a quote does not appear in field.
        if (!this.#options.lazyQuotes) {
          const j = field.indexOf(quote);
          if (j >= 0) {
            const col = codePointLength(
              fullLine.slice(0, fullLine.length - line.slice(j).length),
            );
            throw new SyntaxError(
              createBareQuoteErrorMessage(
                zeroBasedStartLine,
                zeroBasedLine,
                col,
              ),
            );
          }
        }
        recordBuffer += field;
        fieldIndexes.push(recordBuffer.length);
        if (i >= 0) {
          line = line.substring(i + separatorLen);
          continue parseField;
        }
        break parseField;
      } else {
        // Quoted string field
        line = line.substring(quoteLen);
        while (true) {
          const i = line.indexOf(quote);
          if (i >= 0) {
            // Hit next quote.
            recordBuffer += line.substring(0, i);
            line = line.substring(i + quoteLen);
            if (line.startsWith(quote)) {
              // `""` sequence (append quote).
              recordBuffer += quote;
              line = line.substring(quoteLen);
            } else if (line.startsWith(this.#options.separator)) {
              // `","` sequence (end of field).
              line = line.substring(separatorLen);
              fieldIndexes.push(recordBuffer.length);
              continue parseField;
            } else if (0 === line.length) {
              // `"\n` sequence (end of line).
              fieldIndexes.push(recordBuffer.length);
              break parseField;
            } else if (this.#options.lazyQuotes) {
              // `"` sequence (bare quote).
              recordBuffer += quote;
            } else {
              // `"*` sequence (invalid non-escaped quote).
              const col = codePointLength(
                fullLine.slice(0, fullLine.length - line.length - quoteLen),
              );
              throw new SyntaxError(
                createQuoteErrorMessage(zeroBasedStartLine, zeroBasedLine, col),
              );
            }
          } else if (line.length > 0 || !(this.#isEOF())) {
            // Hit end of line (copy all data so far).
            recordBuffer += line;
            const r = this.#readLine();
            line = r ?? ""; // This is a workaround for making this module behave similarly to the encoding/csv/reader.go.
            fullLine = line;
            if (r === null) {
              // Abrupt end of file (EOF or error).
              if (!this.#options.lazyQuotes) {
                const col = codePointLength(fullLine);
                throw new SyntaxError(
                  createQuoteErrorMessage(
                    zeroBasedStartLine,
                    zeroBasedLine,
                    col,
                  ),
                );
              }
              fieldIndexes.push(recordBuffer.length);
              break parseField;
            }
            zeroBasedLine++;
            // The line terminator was stripped by `#readLine`; put a line
            // feed back because it is part of the quoted field's value.
            recordBuffer += "\n";
          } else {
            // Abrupt end of file (EOF on error).
            if (!this.#options.lazyQuotes) {
              const col = codePointLength(fullLine);
              throw new SyntaxError(
                createQuoteErrorMessage(zeroBasedStartLine, zeroBasedLine, col),
              );
            }
            fieldIndexes.push(recordBuffer.length);
            break parseField;
          }
        }
      }
    }

    // Cut the accumulated buffer into the individual field values.
    const result: string[] = [];
    let preIdx = 0;
    for (const i of fieldIndexes) {
      result.push(recordBuffer.slice(preIdx, i));
      preIdx = i;
    }
    return result;
  }

  /**
   * Parses the whole input into records.
   *
   * A leading byte order mark is removed. Blank lines and comment lines do
   * not produce records.
   *
   * @param input The CSV text to parse.
   * @returns The parsed records, each an array of field values.
   * @throws {Error} If the separator or comment option is `\r`, `\n` or `"`,
   * or if both are the same string.
   * @throws {SyntaxError} If a record is malformed or its number of fields
   * does not match the expected count.
   */
  parse(input: string): string[][] {
    this.#input = input.startsWith(BYTE_ORDER_MARK) ? input.slice(1) : input;
    this.#cursor = 0;
    const result: string[][] = [];

    const INVALID_RUNE = ["\r", "\n", '"'];

    const options = this.#options;
    if (
      INVALID_RUNE.includes(options.separator) ||
      (typeof options.comment === "string" &&
        INVALID_RUNE.includes(options.comment)) ||
      options.separator === options.comment
    ) {
      throw new Error("Cannot parse input: invalid delimiter");
    }

    // The number of fields per record that is either inferred from the first
    // row (when options.fieldsPerRecord = 0), or set by the caller (when
    // options.fieldsPerRecord > 0).
    //
    // Each possible variant means the following:
    // "ANY": Variable number of fields is allowed.
    // "UNINITIALIZED": The first row has not been read yet. Once it's read, the
    //                  number of fields will be set.
    // <number>: The number of fields per record that every record must follow.
    let expectedFieldCount: "ANY" | "UNINITIALIZED" | number;
    if (options.fieldsPerRecord === undefined || options.fieldsPerRecord < 0) {
      expectedFieldCount = "ANY";
    } else if (options.fieldsPerRecord === 0) {
      expectedFieldCount = "UNINITIALIZED";
    } else {
      // TODO: Should we check if it's a valid integer?
      expectedFieldCount = options.fieldsPerRecord;
    }

    let lineIndex = 0;
    while (true) {
      const record = this.#parseRecord(lineIndex);
      if (record === null) break;
      lineIndex++;

      // Blank lines and comment lines come back as empty records. They are
      // skipped *before* the field count is inferred, otherwise a leading
      // blank or comment line would fix the expected count at 0 and make the
      // first real row fail.
      if (record.length === 0) continue;

      if (expectedFieldCount === "UNINITIALIZED") {
        // If fieldsPerRecord is 0, the count is taken from the first record.
        expectedFieldCount = record.length;
      } else if (
        typeof expectedFieldCount === "number" &&
        expectedFieldCount !== record.length
      ) {
        throw new SyntaxError(
          `Syntax error on line ${lineIndex}: expected ${expectedFieldCount} fields but got ${record.length}`,
        );
      }

      result.push(record);
    }
    return result;
  }
}

/** Options for {@linkcode parse}. */
export interface ParseOptions {
  /** Character which separates values.
   *
   * @default {","}
   */
  separator?: string;
  /** Character to start a comment.
   *
   * Lines beginning with the comment character without preceding whitespace
   * are ignored. With leading whitespace the comment character becomes part of
   * the field, even if you provide `trimLeadingSpace: true`.
   *
   * By default, no character is considered to be a start of a comment.
   */
  comment?: string;
  /** Flag to trim the leading space of the value.
   *
   * This is done even if the field delimiter, `separator`, is white space.
   *
   * @default {false}
   */
  trimLeadingSpace?: boolean;
  /**
   * Allow unquoted quote in a quoted field or non-double-quoted quotes in
   * quoted field.
   *
   * @default {false}
   */
  lazyQuotes?: boolean;
  /**
   * Enabling checking number of expected fields for each row.
   *
   * If positive, each record is required to have the given number of fields.
   * If 0, it will be set to the number of fields in the first row, so that
   * future rows must have the same field count.
   * If negative, no check is made and records may have a variable number of
   * fields.
   *
   * If the wrong number of fields is in a row, a {@linkcode SyntaxError} is
   * thrown.
   */
  fieldsPerRecord?: number;
  /**
   * If you provide `skipFirstRow: true` and `columns`, the first line will be
   * skipped.
   * If you provide `skipFirstRow: true` but not `columns`, the first line will
   * be skipped and used as header definitions.
   *
   * @default {false}
   */
  skipFirstRow?: boolean;

  /** List of names used for header definition. */
  columns?: readonly string[];
}

/**
 * Parses CSV string into an array of arrays of strings.
 *
 * @example Usage
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertEquals } from "@std/assert/equals";
 *
 * const string = "a,b,c\n#d,e,f";
 *
 * assertEquals(parse(string), [["a", "b", "c"], ["#d", "e", "f"]]);
 * ```
 *
 * @example Quoted fields
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertEquals } from "@std/assert/equals";
 *
 * const string = `"a ""word""","comma,","newline\n"\nfoo,bar,baz`;
 * const result = parse(string);
 *
 * assertEquals(result, [
 *   ['a "word"', "comma,", "newline\n"],
 *   ["foo", "bar", "baz"]
 * ]);
 * ```
 *
 * @param input The input to parse.
 * @returns The parsed data.
 * @throws {SyntaxError} If the input contains a malformed record.
 */
export function parse(input: string): string[][];
/**
 * Parses CSV string into an array of objects or an array of arrays of strings.
 *
 * If `columns` or `skipFirstRow` option is provided, it returns an array of
 * objects, otherwise it returns an array of arrays of string.
 *
 * @example Don't skip first row with `skipFirstRow: false`
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertEquals } from "@std/assert/equals";
 * import { assertType, IsExact } from "@std/testing/types"
 *
 * const string = "a,b,c\nd,e,f";
 * const result = parse(string, { skipFirstRow: false });
 *
 * assertEquals(result, [["a", "b", "c"], ["d", "e", "f"]]);
 * assertType<IsExact<typeof result, string[][]>>(true);
 * ```
 *
 * @example Skip first row with `skipFirstRow: true`
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertEquals } from "@std/assert/equals";
 *
 * const string = "a,b,c\nd,e,f";
 * const result = parse(string, { skipFirstRow: true });
 *
 * assertEquals(result, [{ a: "d", b: "e", c: "f" }]);
 * ```
 *
 * @example Specify columns with `columns` option
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertEquals } from "@std/assert/equals";
 *
 * const string = "a,b,c\nd,e,f";
 * const result = parse(string, { columns: ["x", "y", "z"] });
 *
 * assertEquals(result, [{ x: "a", y: "b", z: "c" }, { x: "d", y: "e", z: "f" }]);
 * ```
 *
 * @example Specify columns with `columns` option and skip first row with
 * `skipFirstRow: true`
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertEquals } from "@std/assert/equals";
 *
 * const string = "a,b,c\nd,e,f";
 * const result = parse(string, { columns: ["x", "y", "z"], skipFirstRow: true });
 *
 * assertEquals(result, [{ x: "d", y: "e", z: "f" }]);
 * ```
 *
 * @example TSV (tab-separated values) with `separator: "\t"`
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertEquals } from "@std/assert/equals";
 *
 * const string = "a\tb\tc\nd\te\tf";
 * const result = parse(string, { separator: "\t" });
 *
 * assertEquals(result, [["a", "b", "c"], ["d", "e", "f"]]);
 * ```
 *
 * @example Trim leading space with `trimLeadingSpace: true`
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertEquals } from "@std/assert/equals";
 *
 * const string = " a,  b,    c\n";
 * const result = parse(string, { trimLeadingSpace: true });
 *
 * assertEquals(result, [["a", "b", "c"]]);
 * ```
 *
 * @example Lazy quotes with `lazyQuotes: true`
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertEquals } from "@std/assert/equals";
 *
 * const string = `a "word","1"2",a","b`;
 * const result = parse(string, { lazyQuotes: true });
 *
 * assertEquals(result, [['a "word"', '1"2', 'a"', 'b']]);
 * ```
 *
 * @example Set comment prefix with `comment` option
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertEquals } from "@std/assert/equals";
 *
 * const string = "a,b,c\n# THIS IS A COMMENT LINE\nd,e,f";
 * const result = parse(string, { comment: "#" });
 *
 * assertEquals(result, [["a", "b", "c"], ["d", "e", "f"]]);
 * ```
 *
 * @example Infer the number of fields from the first row with `fieldsPerRecord: 0`
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertThrows } from "@std/assert/throws";
 *
 * // Note that the second row has more fields than the first row
 * const string = "a,b\nc,d,e";
 * assertThrows(
 *   () => parse(string, { fieldsPerRecord: 0 }),
 *   SyntaxError,
 *   "Syntax error on line 2: expected 2 fields but got 3",
 * );
 * ```
 *
 * @example Enforce the number of fields for each row with `fieldsPerRecord: 2`
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertThrows } from "@std/assert/throws";
 *
 * const string = "a,b\nc,d,e";
 * assertThrows(
 *   () => parse(string, { fieldsPerRecord: 2 }),
 *   SyntaxError,
 *   "Syntax error on line 2: expected 2 fields but got 3",
 * );
 * ```
 *
 * @typeParam T The options' type for parsing.
 * @param input The input to parse.
 * @param options The options for parsing.
 * @returns If you don't provide `options.skipFirstRow` or `options.columns`, it
 * returns `string[][]`. If you provide `options.skipFirstRow` or
 * `options.columns`, it returns an array of objects whose keys are the header
 * names.
 * @throws {SyntaxError} If the input contains a malformed record or a record
 * with an unexpected number of fields.
 * @throws {Error} If `options.separator` or `options.comment` is not a valid
 * delimiter.
 * @throws {TypeError} If `options.skipFirstRow` is set and the input contains
 * no record to use as the header row.
 */
export function parse<const T extends ParseOptions>(
  input: string,
  options: T,
): ParseResult<ParseOptions, T>;
export function parse<const T extends ParseOptions>(
  input: string,
  options: T = { skipFirstRow: false } as T,
): ParseResult<ParseOptions, T> {
  const parser = new Parser(options);
  const rows = parser.parse(input);

  if (options.skipFirstRow || options.columns) {
    let headers: readonly string[] = [];

    if (options.skipFirstRow) {
      // The first record is always removed from the data; it only serves as
      // the header row when `columns` is not given.
      const head = rows.shift();
      if (head === undefined) {
        throw new TypeError("Cannot parse input: headers must be defined");
      }
      headers = head;
    }

    if (options.columns) {
      // Explicit column names take precedence over the first row.
      headers = options.columns;
    }

    const zeroBasedFirstLineIndex = options.skipFirstRow ? 1 : 0;
    return rows.map((row, i) =>
      convertRowToObject(row, headers, zeroBasedFirstLineIndex + i)
    ) as ParseResult<ParseOptions, T>;
  }
  return rows as ParseResult<ParseOptions, T>;
}