feat(csv): support fieldsPerRecord in CsvParseStream (#5600)

Although the constructor of `CsvParseStream` accepts `fieldsPerRecord` option
(see https://jsr.io/@std/csv@1.0.0-rc.5/doc/~/CsvParseStreamOptions) that
ensures that every record has the specified (or inferred from the first row)
number of fields, this option has no effect at all in the current
implementation. To fix this issue, this patch implements the `fieldsPerRecord`
logic in `CsvParseStream` together with sufficient amount of test cases.
This commit is contained in:
Yusuke Tanaka 2024-08-01 19:48:14 +09:00 committed by GitHub
parent 577fd9a6e8
commit b85d219148
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 115 additions and 16 deletions

View File

@ -135,6 +135,17 @@ export class CsvParseStream<
#lineIndex = 0;
#isFirstRow = true;
// The number of fields per record that is either inferred from the first row
// (when options.fieldsPerRecord = 0), or set by the caller (when
// options.fieldsPerRecord > 0).
//
// Each possible variant means the following:
// "ANY": Variable number of fields is allowed.
// "UNINITIALIZED": The first row has not been read yet. Once it's read, the
// number of fields will be set.
// <number>: The number of fields per record that every record must follow.
#fieldsPerRecord: "ANY" | "UNINITIALIZED" | number;
#headers: readonly string[] = [];
/** Construct a new instance.
@ -147,6 +158,18 @@ export class CsvParseStream<
...options,
};
if (
this.#options.fieldsPerRecord === undefined ||
this.#options.fieldsPerRecord < 0
) {
this.#fieldsPerRecord = "ANY";
} else if (this.#options.fieldsPerRecord === 0) {
this.#fieldsPerRecord = "UNINITIALIZED";
} else {
// TODO: Should we check if it's a valid integer?
this.#fieldsPerRecord = this.#options.fieldsPerRecord;
}
this.#lines = new TextDelimiterStream("\n");
this.#lineReader = new StreamLineReader(this.#lines.readable.getReader());
this.#readable = new ReadableStream({
@ -198,6 +221,21 @@ export class CsvParseStream<
if (this.#options.skipFirstRow) {
return this.#pull(controller);
}
if (this.#fieldsPerRecord === "UNINITIALIZED") {
this.#fieldsPerRecord = record.length;
}
}
if (
typeof this.#fieldsPerRecord === "number" &&
record.length !== this.#fieldsPerRecord
) {
throw new SyntaxError(
`record on line ${
this.#lineIndex + 1
}: expected ${this.#fieldsPerRecord} fields but got ${record.length}`,
);
}
this.#lineIndex++;

View File

@ -1,7 +1,7 @@
// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
import { CsvParseStream } from "./parse_stream.ts";
import type { CsvParseStreamOptions } from "./parse_stream.ts";
import { assertEquals, assertRejects } from "@std/assert";
import { assert, assertEquals, assertRejects } from "@std/assert";
import type { AssertTrue, IsExact } from "@std/testing/types";
import { fromFileUrl, join } from "@std/path";
import { delay } from "@std/async/delay";
@ -83,8 +83,11 @@ Deno.test({
{
name: "Separator is undefined",
input: "a;b;c\n",
errorMessage: "Separator is required",
separator: undefined,
error: {
klass: TypeError,
msg: "Separator is required",
},
},
{
name: "MultiLine",
@ -128,13 +131,52 @@ field"`,
],
},
{
name: "FieldCount",
name: "fieldsPerRecord - variable number of fields is allowed",
input: "a,b,c\nd,e",
output: [
["a", "b", "c"],
["d", "e"],
],
},
{
name: "fieldsPerRecord = -42 - variable number of fields is allowed",
input: "a,b,c\nd,e",
output: [
["a", "b", "c"],
["d", "e"],
],
fieldsPerRecord: -42,
},
{
name:
"fieldsPerRecord = 0 - the number of fields is inferred from the first row",
input: "a,b,c\nd,e,f",
output: [
["a", "b", "c"],
["d", "e", "f"],
],
fieldsPerRecord: 0,
},
{
name:
"fieldsPerRecord = 0 - inferred number of fields does not match subsequent rows",
input: "a,b,c\nd,e",
fieldsPerRecord: 0,
error: {
klass: SyntaxError,
msg: "record on line 2: expected 3 fields but got 2",
},
},
{
name:
"fieldsPerRecord = 3 - SyntaxError is thrown when the number of fields is not 2",
input: "a,b,c\nd,e",
fieldsPerRecord: 3,
error: {
klass: SyntaxError,
msg: "record on line 2: expected 3 fields but got 2",
},
},
{
name: "TrailingCommaEOF",
input: "a,b,c,",
@ -312,18 +354,29 @@ x,,,
input: "a,b,c\nd,e",
skipFirstRow: true,
columns: ["foo", "bar", "baz"],
errorMessage:
"Error number of fields line: 1\nNumber of fields found: 3\nExpected number of fields: 2",
error: {
klass: Error,
msg:
"Error number of fields line: 1\nNumber of fields found: 3\nExpected number of fields: 2",
},
},
{
name: "bad quote in bare field",
input: `a "word",1,2,3`,
errorMessage: "Error line: 1\nBad quoting",
error: {
klass: SyntaxError,
msg:
'record on line 1; parse error on line 0, column 2: bare " in non-quoted-field',
},
},
{
name: "bad quote in quoted field",
input: `"wo"rd",1,2,3`,
errorMessage: "Error line: 1\nBad quoting",
error: {
klass: SyntaxError,
msg:
'record on line 1; parse error on line 0, column 3: extraneous or missing " in quoted-field',
},
},
{
name: "lazy quote",
@ -341,18 +394,21 @@ x,,,
if ("comment" in testCase) {
options.comment = testCase.comment;
}
if ("skipFirstRow" in testCase) {
options.skipFirstRow = testCase.skipFirstRow;
}
if ("columns" in testCase) {
options.columns = testCase.columns;
}
if ("trimLeadingSpace" in testCase) {
options.trimLeadingSpace = testCase.trimLeadingSpace;
}
if ("lazyQuotes" in testCase) {
options.lazyQuotes = testCase.lazyQuotes;
}
if ("fieldsPerRecord" in testCase) {
options.fieldsPerRecord = testCase.fieldsPerRecord;
}
if ("skipFirstRow" in testCase) {
options.skipFirstRow = testCase.skipFirstRow;
}
if ("columns" in testCase) {
options.columns = testCase.columns;
}
const readable = ReadableStream.from(testCase.input)
.pipeThrough(new CsvParseStream(options));
@ -361,9 +417,14 @@ x,,,
const actual = await Array.fromAsync(readable);
assertEquals(actual, testCase.output);
} else {
await assertRejects(async () => {
for await (const _ of readable);
}, testCase.errorMessage);
assert(testCase.error);
await assertRejects(
async () => {
for await (const _ of readable);
},
testCase.error.klass,
testCase.error.msg,
);
}
});
}