mirror of
https://github.com/denoland/std.git
synced 2024-11-21 20:50:22 +00:00
532 lines
16 KiB
TypeScript
532 lines
16 KiB
TypeScript
// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
|
|
// This module is browser compatible.
|
|
|
|
import {
|
|
convertRowToObject,
|
|
createBareQuoteErrorMessage,
|
|
createQuoteErrorMessage,
|
|
type ParseResult,
|
|
type ReadOptions,
|
|
type RecordWithColumn,
|
|
} from "./_io.ts";
|
|
import { codePointLength } from "./_shared.ts";
|
|
|
|
export type { ParseResult, RecordWithColumn };
|
|
|
|
/** Byte order mark; stripped from the start of the input before parsing. */
const BYTE_ORDER_MARK = "\ufeff";

/**
 * Synchronous CSV parser operating on a complete input string.
 *
 * The constructor stores the reader options; every call to `parse()` resets
 * the input and the cursor, so one instance can parse several inputs.
 */
class Parser {
  /** Input currently being parsed (leading byte order mark removed). */
  #input = "";
  /** Index (in UTF-16 code units) of the next unread character of `#input`. */
  #cursor = 0;
  /** Reader options with the defaults for `separator` and `trimLeadingSpace` applied. */
  #options: {
    separator: string;
    trimLeadingSpace: boolean;
    comment: string | undefined;
    lazyQuotes: boolean | undefined;
    fieldsPerRecord: number | undefined;
  };

  /**
   * @param options Reader options. `separator` defaults to `","` and
   * `trimLeadingSpace` to `false`; the other options stay `undefined` when
   * omitted.
   */
  constructor({
    separator = ",",
    trimLeadingSpace = false,
    comment,
    lazyQuotes,
    fieldsPerRecord,
  }: ReadOptions = {}) {
    this.#options = {
      separator,
      trimLeadingSpace,
      comment,
      lazyQuotes,
      fieldsPerRecord,
    };
  }

  /**
   * Reads the next physical line and advances the cursor past its terminator.
   *
   * A line ends at `\n` or `\r\n`; the terminator is not part of the returned
   * string. A final line without terminator additionally loses one trailing
   * `\r`.
   *
   * @returns The line content, or `null` when the cursor is at the end of the
   * input.
   */
  #readLine(): string | null {
    if (this.#isEOF()) return null;

    const start = this.#cursor;
    const newlineIndex = this.#input.indexOf("\n", start);

    if (newlineIndex === -1) {
      // Unterminated last line: consume everything that is left.
      this.#cursor = this.#input.length;
      const rest = this.#input.slice(start);
      return rest.endsWith("\r") ? rest.slice(0, -1) : rest;
    }

    this.#cursor = newlineIndex + 1;
    // Treat "\r\n" as a single terminator. The `\r` only counts when it
    // belongs to this line, i.e. when it sits at or after `start`.
    const isCrLf = newlineIndex > start &&
      this.#input[newlineIndex - 1] === "\r";
    return this.#input.slice(start, isCrLf ? newlineIndex - 1 : newlineIndex);
  }

  /** Returns `true` once the cursor has reached the end of the input. */
  #isEOF(): boolean {
    return this.#cursor >= this.#input.length;
  }

  /**
   * Parses one record, reading further physical lines when a quoted field
   * contains line breaks.
   *
   * @param zeroBasedStartLine Zero-based index reported as the record's start
   * line in error messages.
   * @returns The fields of the record, an empty array for a blank or comment
   * line, or `null` at the end of the input.
   * @throws {SyntaxError} On a bare quote in an unquoted field, a stray quote
   * inside a quoted field, or an unterminated quoted field, unless
   * `lazyQuotes` is enabled.
   */
  #parseRecord(zeroBasedStartLine: number): string[] | null {
    let fullLine = this.#readLine();
    if (fullLine === null) return null;
    if (fullLine.length === 0) {
      return [];
    }

    let zeroBasedLine = zeroBasedStartLine;

    // line starting with comment character is ignored
    if (this.#options.comment && fullLine[0] === this.#options.comment) {
      return [];
    }

    let line = fullLine;
    const quote = '"';
    const quoteLen = quote.length;
    const separatorLen = this.#options.separator.length;
    // All field values are appended to one buffer; `fieldIndexes` records the
    // buffer length after each field so the buffer can be cut up at the end.
    let recordBuffer = "";
    const fieldIndexes: number[] = [];
    parseField: while (true) {
      if (this.#options.trimLeadingSpace) {
        line = line.trimStart();
      }

      if (line.length === 0 || !line.startsWith(quote)) {
        // Non-quoted string field
        const i = line.indexOf(this.#options.separator);
        let field = line;
        if (i >= 0) {
          field = field.substring(0, i);
        }
        // Check to make sure a quote does not appear in field.
        if (!this.#options.lazyQuotes) {
          const j = field.indexOf(quote);
          if (j >= 0) {
            // Column of the offending quote, counted in code points from the
            // start of the current physical line.
            const col = codePointLength(
              fullLine.slice(0, fullLine.length - line.slice(j).length),
            );
            throw new SyntaxError(
              createBareQuoteErrorMessage(
                zeroBasedStartLine,
                zeroBasedLine,
                col,
              ),
            );
          }
        }
        recordBuffer += field;
        fieldIndexes.push(recordBuffer.length);
        if (i >= 0) {
          line = line.substring(i + separatorLen);
          continue parseField;
        }
        break parseField;
      } else {
        // Quoted string field
        line = line.substring(quoteLen);
        while (true) {
          const i = line.indexOf(quote);
          if (i >= 0) {
            // Hit next quote.
            recordBuffer += line.substring(0, i);
            line = line.substring(i + quoteLen);
            if (line.startsWith(quote)) {
              // `""` sequence (append quote).
              recordBuffer += quote;
              line = line.substring(quoteLen);
            } else if (line.startsWith(this.#options.separator)) {
              // `","` sequence (end of field).
              line = line.substring(separatorLen);
              fieldIndexes.push(recordBuffer.length);
              continue parseField;
            } else if (0 === line.length) {
              // `"\n` sequence (end of line).
              fieldIndexes.push(recordBuffer.length);
              break parseField;
            } else if (this.#options.lazyQuotes) {
              // `"` sequence (bare quote).
              recordBuffer += quote;
            } else {
              // `"*` sequence (invalid non-escaped quote).
              const col = codePointLength(
                fullLine.slice(0, fullLine.length - line.length - quoteLen),
              );
              throw new SyntaxError(
                createQuoteErrorMessage(zeroBasedStartLine, zeroBasedLine, col),
              );
            }
          } else if (line.length > 0 || !(this.#isEOF())) {
            // Hit end of line (copy all data so far).
            recordBuffer += line;
            const r = this.#readLine();
            line = r ?? ""; // This is a workaround for making this module behave similarly to the encoding/csv/reader.go.
            fullLine = line;
            if (r === null) {
              // Abrupt end of file (EOF or error).
              if (!this.#options.lazyQuotes) {
                const col = codePointLength(fullLine);
                throw new SyntaxError(
                  createQuoteErrorMessage(
                    zeroBasedStartLine,
                    zeroBasedLine,
                    col,
                  ),
                );
              }
              fieldIndexes.push(recordBuffer.length);
              break parseField;
            }
            zeroBasedLine++;
            recordBuffer += "\n"; // preserve line feed (This is because TextProtoReader removes it.)
          } else {
            // Abrupt end of file (EOF on error).
            if (!this.#options.lazyQuotes) {
              const col = codePointLength(fullLine);
              throw new SyntaxError(
                createQuoteErrorMessage(zeroBasedStartLine, zeroBasedLine, col),
              );
            }
            fieldIndexes.push(recordBuffer.length);
            break parseField;
          }
        }
      }
    }

    // Cut the shared buffer into the individual field values.
    const result: string[] = [];
    let preIdx = 0;
    for (const i of fieldIndexes) {
      result.push(recordBuffer.slice(preIdx, i));
      preIdx = i;
    }
    return result;
  }

  /**
   * Parses a whole CSV document.
   *
   * @param input The CSV text; a leading byte order mark is ignored.
   * @returns One array of field values per record. Blank lines and comment
   * lines produce no record.
   * @throws {Error} If the separator or the comment character is `\r`, `\n`
   * or `"`, or if both are the same.
   * @throws {SyntaxError} If a record is malformed or its number of fields
   * violates `fieldsPerRecord`.
   */
  parse(input: string): string[][] {
    this.#input = input.startsWith(BYTE_ORDER_MARK) ? input.slice(1) : input;
    this.#cursor = 0;
    const result: string[][] = [];

    // Incremented once per call to #parseRecord that yields a record (blank
    // and comment lines included), so a quoted field spanning several physical
    // lines advances it by one only.
    let lineIndex = 0;

    const INVALID_RUNE = ["\r", "\n", '"'];

    const options = this.#options;
    if (
      INVALID_RUNE.includes(options.separator) ||
      (typeof options.comment === "string" &&
        INVALID_RUNE.includes(options.comment)) ||
      options.separator === options.comment
    ) {
      throw new Error("Cannot parse input: invalid delimiter");
    }

    // The number of fields per record that is either inferred from the first
    // row (when options.fieldsPerRecord = 0), or set by the caller (when
    // options.fieldsPerRecord > 0).
    //
    // Each possible variant means the following:
    // "ANY": Variable number of fields is allowed.
    // "UNINITIALIZED": The first row has not been read yet. Once it's read, the
    //                  number of fields will be set.
    // <number>: The number of fields per record that every record must follow.
    let nbFields: "ANY" | "UNINITIALIZED" | number;
    if (options.fieldsPerRecord === undefined || options.fieldsPerRecord < 0) {
      nbFields = "ANY";
    } else if (options.fieldsPerRecord === 0) {
      nbFields = "UNINITIALIZED";
    } else {
      // TODO: Should we check if it's a valid integer?
      nbFields = options.fieldsPerRecord;
    }

    while (true) {
      const record = this.#parseRecord(lineIndex);
      if (record === null) break;
      lineIndex++;

      // Blank lines and comment lines come back as an empty record. They are
      // skipped entirely and must not take part in field-count inference,
      // otherwise a leading blank/comment line would pin the count to 0.
      if (record.length === 0) continue;

      // If fieldsPerRecord is 0, it is set to the number of fields in the
      // first (non-empty) record.
      if (nbFields === "UNINITIALIZED") {
        nbFields = record.length;
      }

      if (typeof nbFields === "number" && nbFields !== record.length) {
        throw new SyntaxError(
          `Syntax error on line ${lineIndex}: expected ${nbFields} fields but got ${record.length}`,
        );
      }
      result.push(record);
    }
    return result;
  }
}
|
|
|
|
/** Options accepted by {@linkcode parse}. */
export interface ParseOptions {
  /**
   * The character placed between two values of a record.
   *
   * @default {","}
   */
  separator?: string;
  /**
   * The character that introduces a comment line.
   *
   * A line is ignored only when the comment character is its very first
   * character. If whitespace precedes it, the comment character is treated as
   * part of the field, even if you provide `trimLeadingSpace: true`.
   *
   * When omitted, no character starts a comment.
   */
  comment?: string;
  /**
   * Whether leading whitespace of each value is removed.
   *
   * The trimming is applied even when the field delimiter, `separator`, is
   * itself white space.
   *
   * @default {false}
   */
  trimLeadingSpace?: boolean;
  /**
   * Whether to tolerate a quote appearing in an unquoted field and a
   * non-doubled quote appearing in a quoted field.
   *
   * @default {false}
   */
  lazyQuotes?: boolean;
  /**
   * Enables checking the number of fields of each row.
   *
   * - Positive: every record must have exactly this number of fields.
   * - `0`: the number of fields of the first row becomes the required number
   *   for all following rows.
   * - Negative: nothing is checked; records may differ in their number of
   *   fields.
   *
   * A row with the wrong number of fields causes a {@linkcode SyntaxError} to
   * be thrown.
   */
  fieldsPerRecord?: number;
  /**
   * Whether the first line is removed from the result.
   *
   * With `skipFirstRow: true` and `columns`, the first line is skipped.
   * With `skipFirstRow: true` and no `columns`, the first line is skipped and
   * its values are used as the header definitions.
   *
   * @default {false}
   */
  skipFirstRow?: boolean;

  /** Names used as the header definition. */
  columns?: readonly string[];
}
|
|
|
|
/**
 * Parses a CSV string into an array of arrays of strings.
 *
 * @example Usage
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertEquals } from "@std/assert/equals";
 *
 * const string = "a,b,c\n#d,e,f";
 *
 * assertEquals(parse(string), [["a", "b", "c"], ["#d", "e", "f"]]);
 * ```
 *
 * @example Quoted fields
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertEquals } from "@std/assert/equals";
 *
 * const string = `"a ""word""","comma,","newline\n"\nfoo,bar,baz`;
 * const result = parse(string);
 *
 * assertEquals(result, [
 *   ['a "word"', "comma,", "newline\n"],
 *   ["foo", "bar", "baz"]
 * ]);
 * ```
 *
 * @param input The input to parse.
 * @returns The parsed data.
 */
export function parse(input: string): string[][];
/**
 * Parses a CSV string into an array of objects or an array of arrays of
 * strings.
 *
 * When the `columns` or the `skipFirstRow` option is provided, the result is
 * an array of objects; otherwise it is an array of arrays of strings.
 *
 * @example Don't skip first row with `skipFirstRow: false`
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertEquals } from "@std/assert/equals";
 * import { assertType, IsExact } from "@std/testing/types";
 *
 * const string = "a,b,c\nd,e,f";
 * const result = parse(string, { skipFirstRow: false });
 *
 * assertEquals(result, [["a", "b", "c"], ["d", "e", "f"]]);
 * assertType<IsExact<typeof result, string[][]>>(true);
 * ```
 *
 * @example Skip first row with `skipFirstRow: true`
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertEquals } from "@std/assert/equals";
 * import { assertType, IsExact } from "@std/testing/types";
 *
 * const string = "a,b,c\nd,e,f";
 * const result = parse(string, { skipFirstRow: true });
 *
 * assertEquals(result, [{ a: "d", b: "e", c: "f" }]);
 * assertType<IsExact<typeof result, Record<string, string>[]>>(true);
 * ```
 *
 * @example Specify columns with `columns` option
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertEquals } from "@std/assert/equals";
 * import { assertType, IsExact } from "@std/testing/types";
 *
 * const string = "a,b,c\nd,e,f";
 * const result = parse(string, { columns: ["x", "y", "z"] });
 *
 * assertEquals(result, [{ x: "a", y: "b", z: "c" }, { x: "d", y: "e", z: "f" }]);
 * assertType<IsExact<typeof result, Record<"x" | "y" | "z", string>[]>>(true);
 * ```
 *
 * @example Specify columns with `columns` option and skip first row with
 * `skipFirstRow: true`
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertEquals } from "@std/assert/equals";
 * import { assertType, IsExact } from "@std/testing/types";
 *
 * const string = "a,b,c\nd,e,f";
 * const result = parse(string, { columns: ["x", "y", "z"], skipFirstRow: true });
 *
 * assertEquals(result, [{ x: "d", y: "e", z: "f" }]);
 * assertType<IsExact<typeof result, Record<"x" | "y" | "z", string>[]>>(true);
 * ```
 *
 * @example TSV (tab-separated values) with `separator: "\t"`
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertEquals } from "@std/assert/equals";
 *
 * const string = "a\tb\tc\nd\te\tf";
 * const result = parse(string, { separator: "\t" });
 *
 * assertEquals(result, [["a", "b", "c"], ["d", "e", "f"]]);
 * ```
 *
 * @example Trim leading space with `trimLeadingSpace: true`
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertEquals } from "@std/assert/equals";
 *
 * const string = " a,  b,    c\n";
 * const result = parse(string, { trimLeadingSpace: true });
 *
 * assertEquals(result, [["a", "b", "c"]]);
 * ```
 *
 * @example Lazy quotes with `lazyQuotes: true`
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertEquals } from "@std/assert/equals";
 *
 * const string = `a "word","1"2",a","b`;
 * const result = parse(string, { lazyQuotes: true });
 *
 * assertEquals(result, [['a "word"', '1"2', 'a"', 'b']]);
 * ```
 *
 * @example Set comment prefix with `comment` option
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertEquals } from "@std/assert/equals";
 *
 * const string = "a,b,c\n# THIS IS A COMMENT LINE\nd,e,f";
 * const result = parse(string, { comment: "#" });
 *
 * assertEquals(result, [["a", "b", "c"], ["d", "e", "f"]]);
 * ```
 *
 * @example Infer the number of fields from the first row with `fieldsPerRecord: 0`
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertThrows } from "@std/assert/throws";
 *
 * // Note that the second row has more fields than the first row
 * const string = "a,b\nc,d,e";
 * assertThrows(
 *   () => parse(string, { fieldsPerRecord: 0 }),
 *   SyntaxError,
 *   "Syntax error on line 2: expected 2 fields but got 3",
 * );
 * ```
 *
 * @example Enforce the number of fields for each row with `fieldsPerRecord: 2`
 * ```ts
 * import { parse } from "@std/csv/parse";
 * import { assertThrows } from "@std/assert/throws";
 *
 * const string = "a,b\nc,d,e";
 * assertThrows(
 *   () => parse(string, { fieldsPerRecord: 2 }),
 *   SyntaxError,
 *   "Syntax error on line 2: expected 2 fields but got 3",
 * );
 * ```
 *
 * @typeParam T The options' type for parsing.
 * @param input The input to parse.
 * @param options The options for parsing.
 * @returns `string[][]` when neither `options.skipFirstRow` nor
 * `options.columns` is provided; `Record<string, string>[]` when at least one
 * of them is provided.
 */
export function parse<const T extends ParseOptions>(
  input: string,
  options: T,
): ParseResult<ParseOptions, T>;
export function parse<const T extends ParseOptions>(
  input: string,
  options: T = { skipFirstRow: false } as T,
): ParseResult<ParseOptions, T> {
  const rows = new Parser(options).parse(input);
  const { skipFirstRow, columns } = options;

  // Without header information the raw rows are the result.
  if (!skipFirstRow && !columns) {
    return rows as ParseResult<ParseOptions, T>;
  }

  let headers: readonly string[] = [];

  // The first row is removed from the data whenever `skipFirstRow` is set,
  // and serves as the header definition ...
  if (skipFirstRow) {
    const firstRow = rows.shift();
    if (firstRow === undefined) {
      throw new TypeError("Cannot parse input: headers must be defined");
    }
    headers = firstRow;
  }

  // ... unless explicit column names are given, which take precedence.
  if (columns) {
    headers = columns;
  }

  // Index passed to convertRowToObject for the first remaining row: 1 when the
  // header row was removed, 0 otherwise.
  const firstRowIndex = skipFirstRow ? 1 : 0;
  const objects = rows.map((row, offset) =>
    convertRowToObject(row, headers, firstRowIndex + offset)
  );
  return objects as ParseResult<ParseOptions, T>;
}
|