feat(csv): support fieldsPerRecord in CsvParseStream (#5600)

Although the constructor of `CsvParseStream` accepts `fieldsPerRecord` option (see https://jsr.io/@std/csv@1.0.0-rc.5/doc/~/CsvParseStreamOptions) that ensures that every record has the specified (or inferred from the first row) number of fields, this option has no effect at all in the current implementation. To fix this issue, this patch implements the `fieldsPerRecord` logic in `CsvParseStream` together with sufficient amount of test cases.
2024-11-21 20:50:22 +00:00 · 2024-08-01 19:48:14 +09:00 · 2024-08-01 19:48:14 +09:00 · b85d219148
commit b85d219148
parent 577fd9a6e8
2 changed files with 115 additions and 16 deletions
--- a/csv/parse_stream.ts
+++ b/csv/parse_stream.ts
@ -135,6 +135,17 @@ export class CsvParseStream<
  #lineIndex = 0;
  #isFirstRow = true;

+  // The number of fields per record that is either inferred from the first row
+  // (when options.fieldsPerRecord = 0), or set by the caller (when
+  // options.fieldsPerRecord > 0).
+  //
+  // Each possible variant means the following:
+  // "ANY": Variable number of fields is allowed.
+  // "UNINITIALIZED": The first row has not been read yet. Once it's read, the
+  //                  number of fields will be set.
+  // <number>: The number of fields per record that every record must follow.
+  #fieldsPerRecord: "ANY" | "UNINITIALIZED" | number;
+
  #headers: readonly string[] = [];

  /** Construct a new instance.
@ -147,6 +158,18 @@ export class CsvParseStream<
      ...options,
    };

+    if (
+      this.#options.fieldsPerRecord === undefined ||
+      this.#options.fieldsPerRecord < 0
+    ) {
+      this.#fieldsPerRecord = "ANY";
+    } else if (this.#options.fieldsPerRecord === 0) {
+      this.#fieldsPerRecord = "UNINITIALIZED";
+    } else {
+      // TODO: Should we check if it's a valid integer?
+      this.#fieldsPerRecord = this.#options.fieldsPerRecord;
+    }
+
    this.#lines = new TextDelimiterStream("\n");
    this.#lineReader = new StreamLineReader(this.#lines.readable.getReader());
    this.#readable = new ReadableStream({
@ -198,6 +221,21 @@ export class CsvParseStream<
      if (this.#options.skipFirstRow) {
        return this.#pull(controller);
      }
+
+      if (this.#fieldsPerRecord === "UNINITIALIZED") {
+        this.#fieldsPerRecord = record.length;
+      }
+    }
+
+    if (
+      typeof this.#fieldsPerRecord === "number" &&
+      record.length !== this.#fieldsPerRecord
+    ) {
+      throw new SyntaxError(
+        `record on line ${
+          this.#lineIndex + 1
+        }: expected ${this.#fieldsPerRecord} fields but got ${record.length}`,
+      );
    }

    this.#lineIndex++;
--- a/csv/parse_stream_test.ts
+++ b/csv/parse_stream_test.ts
@ -1,7 +1,7 @@
 // Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
 import { CsvParseStream } from "./parse_stream.ts";
 import type { CsvParseStreamOptions } from "./parse_stream.ts";
-import { assertEquals, assertRejects } from "@std/assert";
+import { assert, assertEquals, assertRejects } from "@std/assert";
 import type { AssertTrue, IsExact } from "@std/testing/types";
 import { fromFileUrl, join } from "@std/path";
 import { delay } from "@std/async/delay";
@ -83,8 +83,11 @@ Deno.test({
      {
        name: "Separator is undefined",
        input: "a;b;c\n",
-        errorMessage: "Separator is required",
        separator: undefined,
+        error: {
+          klass: TypeError,
+          msg: "Separator is required",
+        },
      },
      {
        name: "MultiLine",
@ -128,13 +131,52 @@ field"`,
        ],
      },
      {
-        name: "FieldCount",
+        name: "fieldsPerRecord - variable number of fields is allowed",
        input: "a,b,c\nd,e",
        output: [
          ["a", "b", "c"],
          ["d", "e"],
        ],
      },
+      {
+        name: "fieldsPerRecord = -42 - variable number of fields is allowed",
+        input: "a,b,c\nd,e",
+        output: [
+          ["a", "b", "c"],
+          ["d", "e"],
+        ],
+        fieldsPerRecord: -42,
+      },
+      {
+        name:
+          "fieldsPerRecord = 0 - the number of fields is inferred from the first row",
+        input: "a,b,c\nd,e,f",
+        output: [
+          ["a", "b", "c"],
+          ["d", "e", "f"],
+        ],
+        fieldsPerRecord: 0,
+      },
+      {
+        name:
+          "fieldsPerRecord = 0 - inferred number of fields does not match subsequent rows",
+        input: "a,b,c\nd,e",
+        fieldsPerRecord: 0,
+        error: {
+          klass: SyntaxError,
+          msg: "record on line 2: expected 3 fields but got 2",
+        },
+      },
+      {
+        name:
+          "fieldsPerRecord = 3 - SyntaxError is thrown when the number of fields is not 2",
+        input: "a,b,c\nd,e",
+        fieldsPerRecord: 3,
+        error: {
+          klass: SyntaxError,
+          msg: "record on line 2: expected 3 fields but got 2",
+        },
+      },
      {
        name: "TrailingCommaEOF",
        input: "a,b,c,",
@ -312,18 +354,29 @@ x,,,
        input: "a,b,c\nd,e",
        skipFirstRow: true,
        columns: ["foo", "bar", "baz"],
-        errorMessage:
-          "Error number of fields line: 1\nNumber of fields found: 3\nExpected number of fields: 2",
+        error: {
+          klass: Error,
+          msg:
+            "Error number of fields line: 1\nNumber of fields found: 3\nExpected number of fields: 2",
+        },
      },
      {
        name: "bad quote in bare field",
        input: `a "word",1,2,3`,
-        errorMessage: "Error line: 1\nBad quoting",
+        error: {
+          klass: SyntaxError,
+          msg:
+            'record on line 1; parse error on line 0, column 2: bare " in non-quoted-field',
+        },
      },
      {
        name: "bad quote in quoted field",
        input: `"wo"rd",1,2,3`,
-        errorMessage: "Error line: 1\nBad quoting",
+        error: {
+          klass: SyntaxError,
+          msg:
+            'record on line 1; parse error on line 0, column 3: extraneous or missing " in quoted-field',
+        },
      },
      {
        name: "lazy quote",
@ -341,18 +394,21 @@ x,,,
        if ("comment" in testCase) {
          options.comment = testCase.comment;
        }
-        if ("skipFirstRow" in testCase) {
-          options.skipFirstRow = testCase.skipFirstRow;
-        }
-        if ("columns" in testCase) {
-          options.columns = testCase.columns;
-        }
        if ("trimLeadingSpace" in testCase) {
          options.trimLeadingSpace = testCase.trimLeadingSpace;
        }
        if ("lazyQuotes" in testCase) {
          options.lazyQuotes = testCase.lazyQuotes;
        }
+        if ("fieldsPerRecord" in testCase) {
+          options.fieldsPerRecord = testCase.fieldsPerRecord;
+        }
+        if ("skipFirstRow" in testCase) {
+          options.skipFirstRow = testCase.skipFirstRow;
+        }
+        if ("columns" in testCase) {
+          options.columns = testCase.columns;
+        }

        const readable = ReadableStream.from(testCase.input)
          .pipeThrough(new CsvParseStream(options));
@ -361,9 +417,14 @@ x,,,
          const actual = await Array.fromAsync(readable);
          assertEquals(actual, testCase.output);
        } else {
-          await assertRejects(async () => {
-            for await (const _ of readable);
-          }, testCase.errorMessage);
+          assert(testCase.error);
+          await assertRejects(
+            async () => {
+              for await (const _ of readable);
+            },
+            testCase.error.klass,
+            testCase.error.msg,
+          );
        }
      });
    }