// std/encoding/csv_test.ts

// Test ported from Golang
// https://github.com/golang/go/blob/2cc15b1/src/encoding/csv/reader_test.go
// Copyright 2011 The Go Authors. All rights reserved. BSD license.
// https://github.com/golang/go/blob/master/LICENSE
// Copyright 2018-2021 the Deno authors. All rights reserved. MIT license.

import { assertEquals, assertThrowsAsync } from "../testing/asserts.ts";
import {
  ERR_BARE_QUOTE,
  ERR_FIELD_COUNT,
  ERR_INVALID_DELIM,
  ERR_QUOTE,
  parse,
  ParseError,
  readMatrix,
} from "./csv.ts";
import { StringReader } from "../io/readers.ts";
import { BufReader } from "../io/bufio.ts";

// Test cases for `readMatrix()`, ported from the Go `encoding/csv` reader tests.
// Each fixture is consumed by the runner loop that follows this table:
//   Name               - suffix of the Deno test name (`[CSV] ${Name}`).
//   Input              - CSV text fed to `readMatrix()`; "" is used when omitted.
//   Output             - expected matrix of records when reading succeeds.
//   Error              - expected error; when set, the runner asserts that
//                        `readMatrix()` rejects with an equal error.
//   Separator          - passed as the `separator` option ("," when omitted).
//   Comment            - passed as the `comment` option.
//   TrimLeadingSpace   - passed as the `trimLeadingSpace` option.
//   UseFieldsPerRecord - when true, FieldsPerRecord is passed as the
//                        `fieldsPerRecord` option; otherwise that option is
//                        left undefined.
//   LazyQuotes         - passed as the `lazyQuotes` option.
//   ReuseRecord        - not read by the runner; presumably carried over from
//                        the Go test table.
const testCases = [
  {
    Name: "Simple",
    Input: "a,b,c\n",
    Output: [["a", "b", "c"]],
  },
  {
    Name: "CRLF",
    Input: "a,b\r\nc,d\r\n",
    Output: [
      ["a", "b"],
      ["c", "d"],
    ],
  },
  {
    Name: "BareCR",
    Input: "a,b\rc,d\r\n",
    Output: [["a", "b\rc", "d"]],
  },
  {
    Name: "RFC4180test",
    Input: `#field1,field2,field3
"aaa","bbb","ccc"
"a,a","bbb","ccc"
zzz,yyy,xxx`,
    UseFieldsPerRecord: true,
    FieldsPerRecord: 0,
    Output: [
      ["#field1", "field2", "field3"],
      ["aaa", "bbb", "ccc"],
      ["a,a", `bbb`, "ccc"],
      ["zzz", "yyy", "xxx"],
    ],
  },
  {
    Name: "NoEOLTest",
    Input: "a,b,c",
    Output: [["a", "b", "c"]],
  },
  {
    Name: "Semicolon",
    Input: "a;b;c\n",
    Output: [["a", "b", "c"]],
    Separator: ";",
  },
  {
    Name: "MultiLine",
    Input: `"two
line","one line","three
line
field"`,
    Output: [["two\nline", "one line", "three\nline\nfield"]],
  },
  {
    Name: "BlankLine",
    Input: "a,b,c\n\nd,e,f\n\n",
    Output: [
      ["a", "b", "c"],
      ["d", "e", "f"],
    ],
  },
  {
    Name: "BlankLineFieldCount",
    Input: "a,b,c\n\nd,e,f\n\n",
    Output: [
      ["a", "b", "c"],
      ["d", "e", "f"],
    ],
    UseFieldsPerRecord: true,
    FieldsPerRecord: 0,
  },
  {
    Name: "TrimSpace",
    Input: " a,  b,   c\n",
    Output: [["a", "b", "c"]],
    TrimLeadingSpace: true,
  },
  {
    Name: "LeadingSpace",
    Input: " a,  b,   c\n",
    Output: [[" a", "  b", "   c"]],
  },
  {
    Name: "Comment",
    Input: "#1,2,3\na,b,c\n#comment",
    Output: [["a", "b", "c"]],
    Comment: "#",
  },
  {
    Name: "NoComment",
    Input: "#1,2,3\na,b,c",
    Output: [
      ["#1", "2", "3"],
      ["a", "b", "c"],
    ],
  },
  {
    Name: "LazyQuotes",
    Input: `a "word","1"2",a","b`,
    Output: [[`a "word"`, `1"2`, `a"`, `b`]],
    LazyQuotes: true,
  },
  {
    Name: "BareQuotes",
    Input: `a "word","1"2",a"`,
    Output: [[`a "word"`, `1"2`, `a"`]],
    LazyQuotes: true,
  },
  {
    Name: "BareDoubleQuotes",
    Input: `a""b,c`,
    Output: [[`a""b`, `c`]],
    LazyQuotes: true,
  },
  {
    Name: "BadDoubleQuotes",
    Input: `a""b,c`,
    Error: new ParseError(1, 1, 1, ERR_BARE_QUOTE),
  },
  {
    Name: "TrimQuote",
    Input: ` "a"," b",c`,
    Output: [["a", " b", "c"]],
    TrimLeadingSpace: true,
  },
  {
    Name: "BadBareQuote",
    Input: `a "word","b"`,
    Error: new ParseError(1, 1, 2, ERR_BARE_QUOTE),
  },
  {
    Name: "BadTrailingQuote",
    Input: `"a word",b"`,
    Error: new ParseError(1, 1, 10, ERR_BARE_QUOTE),
  },
  {
    Name: "ExtraneousQuote",
    Input: `"a "word","b"`,
    Error: new ParseError(1, 1, 3, ERR_QUOTE),
  },
  {
    Name: "BadFieldCount",
    Input: "a,b,c\nd,e",
    Error: new ParseError(2, 2, null, ERR_FIELD_COUNT),
    UseFieldsPerRecord: true,
    FieldsPerRecord: 0,
  },
  {
    Name: "BadFieldCount1",
    Input: `a,b,c`,
    UseFieldsPerRecord: true,
    FieldsPerRecord: 2,
    Error: new ParseError(1, 1, null, ERR_FIELD_COUNT),
  },
  {
    Name: "FieldCount",
    Input: "a,b,c\nd,e",
    Output: [
      ["a", "b", "c"],
      ["d", "e"],
    ],
  },
  {
    Name: "TrailingCommaEOF",
    Input: "a,b,c,",
    Output: [["a", "b", "c", ""]],
  },
  {
    Name: "TrailingCommaEOL",
    Input: "a,b,c,\n",
    Output: [["a", "b", "c", ""]],
  },
  {
    Name: "TrailingCommaSpaceEOF",
    Input: "a,b,c, ",
    Output: [["a", "b", "c", ""]],
    TrimLeadingSpace: true,
  },
  {
    Name: "TrailingCommaSpaceEOL",
    Input: "a,b,c, \n",
    Output: [["a", "b", "c", ""]],
    TrimLeadingSpace: true,
  },
  {
    Name: "TrailingCommaLine3",
    Input: "a,b,c\nd,e,f\ng,hi,",
    Output: [
      ["a", "b", "c"],
      ["d", "e", "f"],
      ["g", "hi", ""],
    ],
    TrimLeadingSpace: true,
  },
  {
    Name: "NotTrailingComma3",
    Input: "a,b,c, \n",
    Output: [["a", "b", "c", " "]],
  },
  {
    Name: "CommaFieldTest",
    Input: `x,y,z,w
x,y,z,
x,y,,
x,,,
,,,
"x","y","z","w"
"x","y","z",""
"x","y","",""
"x","","",""
"","","",""
`,
    Output: [
      ["x", "y", "z", "w"],
      ["x", "y", "z", ""],
      ["x", "y", "", ""],
      ["x", "", "", ""],
      ["", "", "", ""],
      ["x", "y", "z", "w"],
      ["x", "y", "z", ""],
      ["x", "y", "", ""],
      ["x", "", "", ""],
      ["", "", "", ""],
    ],
  },
  {
    Name: "TrailingCommaIneffective1",
    Input: "a,b,\nc,d,e",
    Output: [
      ["a", "b", ""],
      ["c", "d", "e"],
    ],
    TrimLeadingSpace: true,
  },
  {
    Name: "ReadAllReuseRecord",
    Input: "a,b\nc,d",
    Output: [
      ["a", "b"],
      ["c", "d"],
    ],
    ReuseRecord: true,
  },
  {
    Name: "StartLine1", // Issue 19019
    Input: 'a,"b\nc"d,e',
    Error: new ParseError(1, 2, 1, ERR_QUOTE),
  },
  {
    Name: "StartLine2",
    Input: 'a,b\n"d\n\n,e',
    Error: new ParseError(2, 5, 0, ERR_QUOTE),
  },
  {
    Name: "CRLFInQuotedField", // Issue 21201
    Input: 'A,"Hello\r\nHi",B\r\n',
    Output: [["A", "Hello\nHi", "B"]],
  },
  {
    Name: "BinaryBlobField", // Issue 19410
    Input: "x09\x41\xb4\x1c,aktau",
    Output: [["x09A\xb4\x1c", "aktau"]],
  },
  {
    Name: "TrailingCR",
    Input: "field1,field2\r",
    Output: [["field1", "field2"]],
  },
  {
    Name: "QuotedTrailingCR",
    Input: '"field"\r',
    Output: [["field"]],
  },
  {
    Name: "QuotedTrailingCRCR",
    Input: '"field"\r\r',
    Error: new ParseError(1, 1, 6, ERR_QUOTE),
  },
  {
    Name: "FieldCR",
    Input: "field\rfield\r",
    Output: [["field\rfield"]],
  },
  {
    Name: "FieldCRCR",
    Input: "field\r\rfield\r\r",
    Output: [["field\r\rfield\r"]],
  },
  {
    Name: "FieldCRCRLF",
    Input: "field\r\r\nfield\r\r\n",
    Output: [["field\r"], ["field\r"]],
  },
  {
    Name: "FieldCRCRLFCR",
    Input: "field\r\r\n\rfield\r\r\n\r",
    Output: [["field\r"], ["\rfield\r"]],
  },
  {
    Name: "FieldCRCRLFCRCR",
    Input: "field\r\r\n\r\rfield\r\r\n\r\r",
    Output: [["field\r"], ["\r\rfield\r"], ["\r"]],
  },
  {
    Name: "MultiFieldCRCRLFCRCR",
    Input: "field1,field2\r\r\n\r\rfield1,field2\r\r\n\r\r,",
    Output: [
      ["field1", "field2\r"],
      ["\r\rfield1", "field2\r"],
      ["\r\r", ""],
    ],
  },
  {
    Name: "NonASCIICommaAndComment",
    Input: "a£b,c£ \td,e\n€ comment\n",
    Output: [["a", "b,c", "d,e"]],
    TrimLeadingSpace: true,
    Separator: "£",
    Comment: "€",
  },
  {
    Name: "NonASCIICommaAndCommentWithQuotes",
    Input: 'a€"  b,"€ c\nλ comment\n',
    Output: [["a", "  b,", " c"]],
    Separator: "€",
    Comment: "λ",
  },
  {
    // λ and θ start with the same byte.
    // This tests that the parser doesn't confuse such characters.
    Name: "NonASCIICommaConfusion",
    Input: '"abθcd"λefθgh',
    Output: [["abθcd", "efθgh"]],
    Separator: "λ",
    Comment: "€",
  },
  {
    Name: "NonASCIICommentConfusion",
    Input: "λ\nλ\nθ\nλ\n",
    Output: [["λ"], ["λ"], ["λ"]],
    Comment: "θ",
  },
  {
    Name: "QuotedFieldMultipleLF",
    Input: '"\n\n\n\n"',
    Output: [["\n\n\n\n"]],
  },
  {
    Name: "MultipleCRLF",
    Input: "\r\n\r\n\r\n\r\n",
    Output: [],
  },
  /**
   * The implementation may read each line in several chunks if it doesn't
   * fit entirely in the read buffer, so we should test the code that handles
   * that condition.
   */
  {
    Name: "HugeLines",
    Input: "#ignore\n".repeat(10000) + "@".repeat(5000) + "," +
      "*".repeat(5000),
    Output: [["@".repeat(5000), "*".repeat(5000)]],
    Comment: "#",
  },
  {
    Name: "QuoteWithTrailingCRLF",
    Input: '"foo"bar"\r\n',
    Error: new ParseError(1, 1, 4, ERR_QUOTE),
  },
  {
    Name: "LazyQuoteWithTrailingCRLF",
    Input: '"foo"bar"\r\n',
    Output: [[`foo"bar`]],
    LazyQuotes: true,
  },
  {
    Name: "DoubleQuoteWithTrailingCRLF",
    Input: '"foo""bar"\r\n',
    Output: [[`foo"bar`]],
  },
  {
    Name: "EvenQuotes",
    Input: `""""""""`,
    Output: [[`"""`]],
  },
  {
    Name: "OddQuotes",
    Input: `"""""""`,
    Error: new ParseError(1, 1, 7, ERR_QUOTE),
  },
  {
    Name: "LazyOddQuotes",
    Input: `"""""""`,
    Output: [[`"""`]],
    LazyQuotes: true,
  },
  // The remaining fixtures have no Input: they only check that an invalid
  // Separator/Comment combination is expected to fail with ERR_INVALID_DELIM.
  {
    Name: "BadComma1",
    Separator: "\n",
    Error: new Error(ERR_INVALID_DELIM),
  },
  {
    Name: "BadComma2",
    Separator: "\r",
    Error: new Error(ERR_INVALID_DELIM),
  },
  {
    Name: "BadComma3",
    Separator: '"',
    Error: new Error(ERR_INVALID_DELIM),
  },
  {
    Name: "BadComment1",
    Comment: "\n",
    Error: new Error(ERR_INVALID_DELIM),
  },
  {
    Name: "BadComment2",
    Comment: "\r",
    Error: new Error(ERR_INVALID_DELIM),
  },
  {
    Name: "BadCommaComment",
    Separator: "X",
    Comment: "X",
    Error: new Error(ERR_INVALID_DELIM),
  },
];
// Register one Deno test per `readMatrix()` fixture in `testCases`.
for (const t of testCases) {
  Deno.test({
    name: `[CSV] ${t.Name}`,
    async fn() {
      // Map the fixture's fields onto `readMatrix()` options; unset (or falsy)
      // fields fall back to the defaults this suite uses.
      const options = {
        separator: t.Separator ? t.Separator : ",",
        comment: t.Comment ? t.Comment : undefined,
        trimLeadingSpace: t.TrimLeadingSpace ? true : false,
        fieldsPerRecord: t.UseFieldsPerRecord ? t.FieldsPerRecord : undefined,
        lazyQuotes: t.LazyQuotes ? t.LazyQuotes : false,
      };
      // Every call builds a fresh reader over the fixture input ("" if absent).
      const readInput = () =>
        readMatrix(new BufReader(new StringReader(t.Input ?? "")), options);

      if (!t.Error) {
        // Success fixture: the parsed matrix must equal the expected output.
        assertEquals(await readInput(), t.Output);
        return;
      }

      // Failure fixture: the read must reject with an error equal to `t.Error`.
      const thrown = await assertThrowsAsync(async () => {
        await readInput();
      });
      assertEquals(thrown, t.Error);
    },
  });
}

// Test cases for `parse()`. Each fixture is consumed by the loop that follows
// this table:
//   name         - suffix of the Deno test name (`[CSV] Parse ${name}`).
//   in           - first argument to `parse()`: a CSV string or a BufReader.
//   skipFirstRow - forwarded as the `skipFirstRow` option.
//   columns      - forwarded as the `columns` option (plain names, or objects
//                  with a `name` and an optional per-cell `parse` callback).
//   parse        - forwarded as the row-level `parse` option.
//   result       - value the output of `parse()` is compared against.
const parseTestCases = [
  {
    name: "simple",
    in: "a,b,c",
    skipFirstRow: false,
    result: [["a", "b", "c"]],
  },
  {
    // Same data as "simple", but supplied through a BufReader instead of a
    // string.
    name: "simple Bufreader",
    in: new BufReader(new StringReader("a,b,c")),
    skipFirstRow: false,
    result: [["a", "b", "c"]],
  },
  {
    name: "multiline",
    in: "a,b,c\ne,f,g\n",
    skipFirstRow: false,
    result: [
      ["a", "b", "c"],
      ["e", "f", "g"],
    ],
  },
  {
    // Expects the first row to supply the keys of the resulting objects.
    name: "header mapping boolean",
    in: "a,b,c\ne,f,g\n",
    skipFirstRow: true,
    result: [{ a: "e", b: "f", c: "g" }],
  },
  {
    // Expects the given column names to be used as keys for every row.
    name: "header mapping array",
    in: "a,b,c\ne,f,g\n",
    columns: ["this", "is", "sparta"],
    result: [
      { this: "a", is: "b", sparta: "c" },
      { this: "e", is: "f", sparta: "g" },
    ],
  },
  {
    // Same expectation as "header mapping array", with columns given as
    // `{ name }` objects.
    name: "header mapping object",
    in: "a,b,c\ne,f,g\n",
    columns: [{ name: "this" }, { name: "is" }, { name: "sparta" }],
    result: [
      { this: "a", is: "b", sparta: "c" },
      { this: "e", is: "f", sparta: "g" },
    ],
  },
  {
    // Expects each column's `parse` callback to transform that column's cell.
    name: "header mapping parse entry",
    in: "a,b,c\ne,f,g\n",
    columns: [
      {
        name: "this",
        parse: (e: string): string => {
          return `b${e}$$`;
        },
      },
      {
        name: "is",
        parse: (e: string): number => {
          return e.length;
        },
      },
      {
        name: "sparta",
        parse: (e: string): unknown => {
          return { bim: `boom-${e}` };
        },
      },
    ],
    result: [
      { this: "ba$$", is: 1, sparta: { bim: `boom-c` } },
      { this: "be$$", is: 1, sparta: { bim: `boom-g` } },
    ],
  },
  {
    // Row-level `parse` without columns: the callback receives the row as a
    // string array.
    name: "multiline parse",
    in: "a,b,c\ne,f,g\n",
    parse: (e: string[]): unknown => {
      return { super: e[0], street: e[1], fighter: e[2] };
    },
    skipFirstRow: false,
    result: [
      { super: "a", street: "b", fighter: "c" },
      { super: "e", street: "f", fighter: "g" },
    ],
  },
  {
    // Row-level `parse` combined with columns: the callback receives an object
    // keyed by the column names.
    name: "header mapping object parseline",
    in: "a,b,c\ne,f,g\n",
    columns: [{ name: "this" }, { name: "is" }, { name: "sparta" }],
    parse: (e: Record<string, unknown>): unknown => {
      return { super: e.this, street: e.is, fighter: e.sparta };
    },
    result: [
      { super: "a", street: "b", fighter: "c" },
      { super: "e", street: "f", fighter: "g" },
    ],
  },
  {
    // With both options set, the expected result omits the first row and keys
    // the remaining rows by the explicit column names.
    name: "provides both opts.skipFirstRow and opts.columns",
    in: "a,b,1\nc,d,2\ne,f,3",
    skipFirstRow: true,
    columns: [
      { name: "foo" },
      { name: "bar" },
      { name: "baz", parse: (e: string) => Number(e) },
    ],
    result: [
      { foo: "c", bar: "d", baz: 2 },
      { foo: "e", bar: "f", baz: 3 },
    ],
  },
];

// Register one Deno test per `parse()` fixture in `parseTestCases`.
for (const testCase of parseTestCases) {
  const { skipFirstRow, columns } = testCase;
  // The fixtures' row-level callbacks accept different argument types
  // (string[] vs. Record<string, unknown>); the cast widens them to
  // `(input: unknown) => unknown` before they are handed to `parse()`.
  const rowParser = testCase.parse as (input: unknown) => unknown;

  Deno.test(`[CSV] Parse ${testCase.name}`, async () => {
    const parsed = await parse(testCase.in, {
      skipFirstRow,
      columns,
      parse: rowParser,
    });
    assertEquals(parsed, testCase.result);
  });
}

// Checks the three message shapes produced by `ParseError`:
// no column, differing record/error lines, and matching record/error lines.
Deno.test({
  name: "[CSV] ParseError.message",
  fn(): void {
    const expectations: Array<[ParseError, string]> = [
      [
        new ParseError(2, 2, null, ERR_FIELD_COUNT),
        `record on line 2: ${ERR_FIELD_COUNT}`,
      ],
      [
        new ParseError(1, 2, 1, ERR_QUOTE),
        `record on line 1; parse error on line 2, column 1: ${ERR_QUOTE}`,
      ],
      [
        new ParseError(1, 1, 7, ERR_QUOTE),
        `parse error on line 1, column 7: ${ERR_QUOTE}`,
      ],
    ];

    for (const [error, expectedMessage] of expectations) {
      assertEquals(error.message, expectedMessage);
    }
  },
});