std/csv/parse_stream_test.ts

583 lines
16 KiB
TypeScript

// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
import { CsvParseStream } from "./parse_stream.ts";
import type { CsvParseStreamOptions } from "./parse_stream.ts";
import { assert, assertEquals, assertRejects } from "@std/assert";
import type { AssertTrue, IsExact } from "@std/testing/types";
import { fromFileUrl, join } from "@std/path";
import { delay } from "@std/async/delay";
const testdataDir = join(fromFileUrl(import.meta.url), "../testdata");
const encoder = new TextEncoder();
Deno.test({
name: "CsvParseStream should work with Deno.FsFile's readable",
permissions: {
read: [testdataDir],
},
fn: async () => {
const file = await Deno.open(join(testdataDir, "simple.csv"));
const readable = file.readable
.pipeThrough(new TextDecoderStream())
.pipeThrough(new CsvParseStream());
const records = await Array.fromAsync(readable);
assertEquals(records, [
["id", "name"],
["1", "foobar"],
["2", "barbaz"],
]);
},
});
Deno.test({
name: "CsvParseStream throws at invalid csv line",
fn: async () => {
const readable = ReadableStream.from([
encoder.encode("id,name\n"),
encoder.encode("\n"),
encoder.encode("1,foo\n"),
encoder.encode('2,"baz\n'),
]).pipeThrough(new TextDecoderStream()).pipeThrough(
new CsvParseStream(),
);
const reader = readable.getReader();
assertEquals(await reader.read(), { done: false, value: ["id", "name"] });
assertEquals(await reader.read(), { done: false, value: ["1", "foo"] });
await assertRejects(
() => reader.read(),
SyntaxError,
`Syntax error on line 4; parse error on line 5, column 1: extraneous or missing " in quoted-field`,
);
},
});
Deno.test({
name: "CsvParseStream handles various inputs",
permissions: "none",
fn: async (t) => {
// These test cases were originally ported from Go:
// https://github.com/golang/go/blob/go1.12.5/src/encoding/csv/
// Copyright 2011 The Go Authors. All rights reserved. BSD license.
// https://github.com/golang/go/blob/master/LICENSE
const testCases = [
{
name: "CRLF",
input: "a,b\r\nc,d\r\n",
output: [["a", "b"], ["c", "d"]],
},
{
name: "BareCR",
input: "a,b\rc,d\r\n",
output: [["a", "b\rc", "d"]],
},
{
name: "NoEOLTest",
input: "a,b,c",
output: [["a", "b", "c"]],
},
{
name: "Semicolon",
input: "a;b;c\n",
output: [["a", "b", "c"]],
separator: ";",
},
{
name: "Separator is undefined",
input: "a,b,c\n",
output: [["a", "b", "c"]],
separator: undefined,
},
{
name: "MultiLine",
input: `"two
line","one line","three
line
field"`,
output: [["two\nline", "one line", "three\nline\nfield"]],
},
{
name: "BlankLine",
input: "a,b,c\n\nd,e,f\n\n",
output: [
["a", "b", "c"],
["d", "e", "f"],
],
},
{
name: "LeadingSpace",
input: " a, b, c\n",
output: [[" a", " b", " c"]],
},
{
name: "trimLeadingSpace = true",
input: " a, b, c\n",
output: [["a", "b", "c"]],
trimLeadingSpace: true,
},
{
name: "Comment",
input: "#1,2,3\na,b,c\n#comment",
output: [["a", "b", "c"]],
comment: "#",
},
{
name: "NoComment",
input: "#1,2,3\na,b,c",
output: [
["#1", "2", "3"],
["a", "b", "c"],
],
},
{
name: "fieldsPerRecord - variable number of fields is allowed",
input: "a,b,c\nd,e",
output: [
["a", "b", "c"],
["d", "e"],
],
},
{
name: "fieldsPerRecord = -42 - variable number of fields is allowed",
input: "a,b,c\nd,e",
output: [
["a", "b", "c"],
["d", "e"],
],
fieldsPerRecord: -42,
},
{
name:
"fieldsPerRecord = 0 - the number of fields is inferred from the first row",
input: "a,b,c\nd,e,f",
output: [
["a", "b", "c"],
["d", "e", "f"],
],
fieldsPerRecord: 0,
},
{
name:
"fieldsPerRecord = 0 - inferred number of fields does not match subsequent rows",
input: "a,b,c\nd,e",
fieldsPerRecord: 0,
error: {
klass: SyntaxError,
msg: "Syntax error on line 2: expected 3 fields but got 2",
},
},
{
name:
"fieldsPerRecord = 3 - SyntaxError is thrown when the number of fields is not 2",
input: "a,b,c\nd,e",
fieldsPerRecord: 3,
error: {
klass: SyntaxError,
msg: "Syntax error on line 2: expected 3 fields but got 2",
},
},
{
name: "TrailingCommaEOF",
input: "a,b,c,",
output: [["a", "b", "c", ""]],
},
{
name: "TrailingCommaEOL",
input: "a,b,c,\n",
output: [["a", "b", "c", ""]],
},
{
name: "NotTrailingComma3",
input: "a,b,c, \n",
output: [["a", "b", "c", " "]],
},
{
name: "CommaFieldTest",
input: `x,y,z,w
x,y,z,
x,y,,
x,,,
,,,
"x","y","z","w"
"x","y","z",""
"x","y","",""
"x","","",""
"","","",""
`,
output: [
["x", "y", "z", "w"],
["x", "y", "z", ""],
["x", "y", "", ""],
["x", "", "", ""],
["", "", "", ""],
["x", "y", "z", "w"],
["x", "y", "z", ""],
["x", "y", "", ""],
["x", "", "", ""],
["", "", "", ""],
],
},
{
name: "CRLFInQuotedField", // Issue 21201
input: 'A,"Hello\r\nHi",B\r\n',
output: [["A", "Hello\nHi", "B"]],
},
{
name: "BinaryBlobField", // Issue 19410
input: "x09\x41\xb4\x1c,aktau",
output: [["x09A\xb4\x1c", "aktau"]],
},
{
name: "TrailingCR",
input: "field1,field2\r",
output: [["field1", "field2"]],
},
{
name: "QuotedTrailingCR",
input: '"field"\r',
output: [["field"]],
},
{
name: "FieldCR",
input: "field\rfield\r",
output: [["field\rfield"]],
},
{
name: "FieldCRCR",
input: "field\r\rfield\r\r",
output: [["field\r\rfield\r"]],
},
{
name: "FieldCRCRLF",
input: "field\r\r\nfield\r\r\n",
output: [["field\r"], ["field\r"]],
},
{
name: "FieldCRCRLFCR",
input: "field\r\r\n\rfield\r\r\n\r",
output: [["field\r"], ["\rfield\r"]],
},
{
name: "MultiFieldCRCRLFCRCR",
input: "field1,field2\r\r\n\r\rfield1,field2\r\r\n\r\r,",
output: [
["field1", "field2\r"],
["\r\rfield1", "field2\r"],
["\r\r", ""],
],
},
{
name: "NonASCIICommaAndCommentWithQuotes",
input: 'a€" b,"€ c\nλ comment\n',
output: [["a", " b,", " c"]],
separator: "€",
comment: "λ",
},
{
// λ and θ start with the same byte.
// This tests that the parser doesn't confuse such characters.
name: "NonASCIICommaConfusion",
input: '"abθcd"λefθgh',
output: [["abθcd", "efθgh"]],
separator: "λ",
comment: "€",
},
{
name: "NonASCIICommentConfusion",
input: "λ\nλ\nθ\nλ\n",
output: [["λ"], ["λ"], ["λ"]],
comment: "θ",
},
{
name: "QuotedFieldMultipleLF",
input: '"\n\n\n\n"',
output: [["\n\n\n\n"]],
},
{
name: "MultipleCRLF",
input: "\r\n\r\n\r\n\r\n",
output: [],
},
{
name: "DoubleQuoteWithTrailingCRLF",
input: '"foo""bar"\r\n',
output: [[`foo"bar`]],
},
{
name: "EvenQuotes",
input: `""""""""`,
output: [[`"""`]],
},
{
name: "simple",
input: "a,b,c",
output: [["a", "b", "c"]],
skipFirstRow: false,
},
{
name: "multiline",
input: "a,b,c\ne,f,g\n",
output: [
["a", "b", "c"],
["e", "f", "g"],
],
skipFirstRow: false,
},
{
name: "header mapping boolean",
input: "a,b,c\ne,f,g\n",
output: [{ a: "e", b: "f", c: "g" }],
skipFirstRow: true,
},
{
name: "header mapping array",
input: "a,b,c\ne,f,g\n",
output: [
{ this: "a", is: "b", sparta: "c" },
{ this: "e", is: "f", sparta: "g" },
],
columns: ["this", "is", "sparta"],
},
{
name: "provides both opts.skipFirstRow and opts.columns",
input: "a,b,1\nc,d,2\ne,f,3",
output: [
{ foo: "c", bar: "d", baz: "2" },
{ foo: "e", bar: "f", baz: "3" },
],
skipFirstRow: true,
columns: ["foo", "bar", "baz"],
},
{
name: "mismatching number of headers and fields 1",
input: "a,b,c\nd,e",
skipFirstRow: true,
columns: ["foo", "bar", "baz"],
error: {
klass: Error,
msg:
"Syntax error on line 2: The record has 2 fields, but the header has 3 fields",
},
},
{
name: "mismatching number of headers and fields 2",
input: "a,b,c\nd,e,,g",
skipFirstRow: true,
columns: ["foo", "bar", "baz"],
error: {
klass: Error,
msg:
"Syntax error on line 2: The record has 4 fields, but the header has 3 fields",
},
},
{
name: "bad quote in bare field",
input: `a "word",1,2,3`,
error: {
klass: SyntaxError,
msg:
'Syntax error on line 1; parse error on line 1, column 3: bare " in non-quoted-field',
},
},
{
name: "bad quote in quoted field",
input: `"wo"rd",1,2,3`,
error: {
klass: SyntaxError,
msg:
'Syntax error on line 1; parse error on line 1, column 4: extraneous or missing " in quoted-field',
},
},
{
name: "bad quote at line 1 in quoted field with newline",
input: `"w\n\no"rd",1,2,3`,
error: {
klass: SyntaxError,
msg:
'Syntax error on line 1; parse error on line 3, column 2: extraneous or missing " in quoted-field',
},
},
{
name: "bad quote at line 2 in quoted field with newline",
input: `a,b,c,d\n"w\n\no"rd",1,2,3`,
error: {
klass: SyntaxError,
msg:
'Syntax error on line 2; parse error on line 4, column 2: extraneous or missing " in quoted-field',
},
},
{
name: "lazy quote",
input: `a "word","1"2",a","b`,
output: [[`a "word"`, `1"2`, `a"`, `b`]],
lazyQuotes: true,
},
];
for (const testCase of testCases) {
await t.step(testCase.name, async () => {
const options: CsvParseStreamOptions = {};
if ("separator" in testCase) {
// @ts-expect-error: explicitly giving undefined
options.separator = testCase.separator;
}
if ("comment" in testCase) {
options.comment = testCase.comment;
}
if ("trimLeadingSpace" in testCase) {
options.trimLeadingSpace = testCase.trimLeadingSpace;
}
if ("lazyQuotes" in testCase) {
options.lazyQuotes = testCase.lazyQuotes;
}
if ("fieldsPerRecord" in testCase) {
options.fieldsPerRecord = testCase.fieldsPerRecord;
}
if ("skipFirstRow" in testCase) {
options.skipFirstRow = testCase.skipFirstRow;
}
if ("columns" in testCase) {
options.columns = testCase.columns;
}
const readable = ReadableStream.from([testCase.input])
.pipeThrough(new CsvParseStream(options));
if (testCase.output) {
const actual = await Array.fromAsync(readable);
assertEquals(actual, testCase.output);
} else {
assert(testCase.error);
await assertRejects(
async () => {
for await (const _ of readable);
},
testCase.error.klass,
testCase.error.msg,
);
}
});
}
},
});
Deno.test({
name:
"CsvParseStream.cancel() does not leak file when called in the middle of iteration",
permissions: { read: [testdataDir] },
fn: async () => {
const file = await Deno.open(join(testdataDir, "large.csv"));
const readable = file.readable
.pipeThrough(new TextDecoderStream())
.pipeThrough(new CsvParseStream());
for await (const _record of readable) {
break;
}
// FIXME(kt3k): Remove this delay.
await delay(100);
},
});
Deno.test({
name: "CsvParseStream is correctly typed",
fn() {
// If no option is passed, defaults to ReadableStream<string[]>.
{
const { readable } = new CsvParseStream();
type _ = AssertTrue<IsExact<typeof readable, ReadableStream<string[]>>>;
}
{
const { readable } = new CsvParseStream(undefined);
type _ = AssertTrue<IsExact<typeof readable, ReadableStream<string[]>>>;
}
{
// `skipFirstRow` may be `true` or `false`.
// `columns` may be `undefined` or `string[]`.
// If you don't know exactly what the value of the option is,
// the return type is ReadableStream<string[] | Record<string, string>>
const options: CsvParseStreamOptions = {};
const { readable } = new CsvParseStream(options);
type _ = AssertTrue<
IsExact<
typeof readable,
ReadableStream<string[] | Record<string, string>>
>
>;
}
{
const { readable } = new CsvParseStream({});
type _ = AssertTrue<IsExact<typeof readable, ReadableStream<string[]>>>;
}
// skipFirstRow option
{
const { readable } = new CsvParseStream({});
type _ = AssertTrue<IsExact<typeof readable, ReadableStream<string[]>>>;
}
{
const { readable } = new CsvParseStream({ skipFirstRow: false });
type _ = AssertTrue<IsExact<typeof readable, ReadableStream<string[]>>>;
}
{
const { readable } = new CsvParseStream({ skipFirstRow: true });
type _ = AssertTrue<
IsExact<
typeof readable,
ReadableStream<Record<string, string>>
>
>;
}
// columns option
{
const { readable } = new CsvParseStream({});
type _ = AssertTrue<IsExact<typeof readable, ReadableStream<string[]>>>;
}
{
const { readable } = new CsvParseStream({ columns: ["aaa", "bbb"] });
type _ = AssertTrue<
IsExact<typeof readable, ReadableStream<Record<"aaa" | "bbb", string>>>
>;
}
{
const { readable } = new CsvParseStream({ columns: ["aaa"] as string[] });
type _ = AssertTrue<
IsExact<
typeof readable,
ReadableStream<Record<string, string>>
>
>;
}
// skipFirstRow option + columns option
{
const { readable } = new CsvParseStream({ skipFirstRow: false });
type _ = AssertTrue<IsExact<typeof readable, ReadableStream<string[]>>>;
}
{
const { readable } = new CsvParseStream({ skipFirstRow: true });
type _ = AssertTrue<
IsExact<
typeof readable,
ReadableStream<Record<string, string>>
>
>;
}
{
const { readable } = new CsvParseStream({
skipFirstRow: false,
columns: ["aaa"],
});
type _ = AssertTrue<
IsExact<typeof readable, ReadableStream<Record<"aaa", string>>>
>;
}
{
const { readable } = new CsvParseStream({
skipFirstRow: true,
columns: ["aaa"],
});
type _ = AssertTrue<
IsExact<typeof readable, ReadableStream<Record<"aaa", string>>>
>;
}
},
});