From b85d219148276a2667fedb85966996ad0093bd93 Mon Sep 17 00:00:00 2001 From: Yusuke Tanaka Date: Thu, 1 Aug 2024 19:48:14 +0900 Subject: [PATCH] feat(csv): support `fieldsPerRecord` in `CsvParseStream` (#5600) Although the constructor of `CsvParseStream` accepts `fieldsPerRecord` option (see https://jsr.io/@std/csv@1.0.0-rc.5/doc/~/CsvParseStreamOptions) that ensures that every record has the specified (or inferred from the first row) number of fields, this option has no effect at all in the current implementation. To fix this issue, this patch implements the `fieldsPerRecord` logic in `CsvParseStream` together with sufficient amount of test cases. --- csv/parse_stream.ts | 38 ++++++++++++++++ csv/parse_stream_test.ts | 93 +++++++++++++++++++++++++++++++++------- 2 files changed, 115 insertions(+), 16 deletions(-) diff --git a/csv/parse_stream.ts b/csv/parse_stream.ts index 40748b128..b255bdcaa 100644 --- a/csv/parse_stream.ts +++ b/csv/parse_stream.ts @@ -135,6 +135,17 @@ export class CsvParseStream< #lineIndex = 0; #isFirstRow = true; + // The number of fields per record that is either inferred from the first row + // (when options.fieldsPerRecord = 0), or set by the caller (when + // options.fieldsPerRecord > 0). + // + // Each possible variant means the following: + // "ANY": Variable number of fields is allowed. + // "UNINITIALIZED": The first row has not been read yet. Once it's read, the + // number of fields will be set. + // : The number of fields per record that every record must follow. + #fieldsPerRecord: "ANY" | "UNINITIALIZED" | number; + #headers: readonly string[] = []; /** Construct a new instance. @@ -147,6 +158,18 @@ export class CsvParseStream< ...options, }; + if ( + this.#options.fieldsPerRecord === undefined || + this.#options.fieldsPerRecord < 0 + ) { + this.#fieldsPerRecord = "ANY"; + } else if (this.#options.fieldsPerRecord === 0) { + this.#fieldsPerRecord = "UNINITIALIZED"; + } else { + // TODO: Should we check if it's a valid integer? + this.#fieldsPerRecord = this.#options.fieldsPerRecord; + } + this.#lines = new TextDelimiterStream("\n"); this.#lineReader = new StreamLineReader(this.#lines.readable.getReader()); this.#readable = new ReadableStream({ @@ -198,6 +221,21 @@ export class CsvParseStream< if (this.#options.skipFirstRow) { return this.#pull(controller); } + + if (this.#fieldsPerRecord === "UNINITIALIZED") { + this.#fieldsPerRecord = record.length; + } + } + + if ( + typeof this.#fieldsPerRecord === "number" && + record.length !== this.#fieldsPerRecord + ) { + throw new SyntaxError( + `record on line ${ + this.#lineIndex + 1 + }: expected ${this.#fieldsPerRecord} fields but got ${record.length}`, + ); } this.#lineIndex++; diff --git a/csv/parse_stream_test.ts b/csv/parse_stream_test.ts index 087abc8b4..6feaf662c 100644 --- a/csv/parse_stream_test.ts +++ b/csv/parse_stream_test.ts @@ -1,7 +1,7 @@ // Copyright 2018-2024 the Deno authors. All rights reserved. MIT license. import { CsvParseStream } from "./parse_stream.ts"; import type { CsvParseStreamOptions } from "./parse_stream.ts"; -import { assertEquals, assertRejects } from "@std/assert"; +import { assert, assertEquals, assertRejects } from "@std/assert"; import type { AssertTrue, IsExact } from "@std/testing/types"; import { fromFileUrl, join } from "@std/path"; import { delay } from "@std/async/delay"; @@ -83,8 +83,11 @@ Deno.test({ { name: "Separator is undefined", input: "a;b;c\n", - errorMessage: "Separator is required", separator: undefined, + error: { + klass: TypeError, + msg: "Separator is required", + }, }, { name: "MultiLine", @@ -128,13 +131,52 @@ field"`, ], }, { - name: "FieldCount", + name: "fieldsPerRecord - variable number of fields is allowed", input: "a,b,c\nd,e", output: [ ["a", "b", "c"], ["d", "e"], ], }, + { + name: "fieldsPerRecord = -42 - variable number of fields is allowed", + input: "a,b,c\nd,e", + output: [ + ["a", "b", "c"], + ["d", "e"], + ], + fieldsPerRecord: -42, + }, + { + name: + "fieldsPerRecord = 0 - the number of fields is inferred from the first row", + input: "a,b,c\nd,e,f", + output: [ + ["a", "b", "c"], + ["d", "e", "f"], + ], + fieldsPerRecord: 0, + }, + { + name: + "fieldsPerRecord = 0 - inferred number of fields does not match subsequent rows", + input: "a,b,c\nd,e", + fieldsPerRecord: 0, + error: { + klass: SyntaxError, + msg: "record on line 2: expected 3 fields but got 2", + }, + }, + { + name: + "fieldsPerRecord = 3 - SyntaxError is thrown when the number of fields is not 2", + input: "a,b,c\nd,e", + fieldsPerRecord: 3, + error: { + klass: SyntaxError, + msg: "record on line 2: expected 3 fields but got 2", + }, + }, { name: "TrailingCommaEOF", input: "a,b,c,", @@ -312,18 +354,29 @@ x,,, input: "a,b,c\nd,e", skipFirstRow: true, columns: ["foo", "bar", "baz"], - errorMessage: - "Error number of fields line: 1\nNumber of fields found: 3\nExpected number of fields: 2", + error: { + klass: Error, + msg: + "Error number of fields line: 1\nNumber of fields found: 3\nExpected number of fields: 2", + }, }, { name: "bad quote in bare field", input: `a "word",1,2,3`, - errorMessage: "Error line: 1\nBad quoting", + error: { + klass: SyntaxError, + msg: + 'record on line 1; parse error on line 0, column 2: bare " in non-quoted-field', + }, }, { name: "bad quote in quoted field", input: `"wo"rd",1,2,3`, - errorMessage: "Error line: 1\nBad quoting", + error: { + klass: SyntaxError, + msg: + 'record on line 1; parse error on line 0, column 3: extraneous or missing " in quoted-field', + }, }, { name: "lazy quote", @@ -341,18 +394,21 @@ x,,, if ("comment" in testCase) { options.comment = testCase.comment; } - if ("skipFirstRow" in testCase) { - options.skipFirstRow = testCase.skipFirstRow; - } - if ("columns" in testCase) { - options.columns = testCase.columns; - } if ("trimLeadingSpace" in testCase) { options.trimLeadingSpace = testCase.trimLeadingSpace; } if ("lazyQuotes" in testCase) { options.lazyQuotes = testCase.lazyQuotes; } + if ("fieldsPerRecord" in testCase) { + options.fieldsPerRecord = testCase.fieldsPerRecord; + } + if ("skipFirstRow" in testCase) { + options.skipFirstRow = testCase.skipFirstRow; + } + if ("columns" in testCase) { + options.columns = testCase.columns; + } const readable = ReadableStream.from(testCase.input) .pipeThrough(new CsvParseStream(options)); @@ -361,9 +417,14 @@ x,,, const actual = await Array.fromAsync(readable); assertEquals(actual, testCase.output); } else { - await assertRejects(async () => { - for await (const _ of readable); - }, testCase.errorMessage); + assert(testCase.error); + await assertRejects( + async () => { + for await (const _ of readable); + }, + testCase.error.klass, + testCase.error.msg, + ); } }); }