diff --git a/encoding/csv.ts b/encoding/csv.ts index c8c7719ca..711e27772 100644 --- a/encoding/csv.ts +++ b/encoding/csv.ts @@ -1,5 +1,7 @@ // Ported from Go: // https://github.com/golang/go/blob/go1.12.5/src/encoding/csv/ +// Copyright 2011 The Go Authors. All rights reserved. BSD license. +// https://github.com/golang/go/blob/master/LICENSE // Copyright 2018-2020 the Deno authors. All rights reserved. MIT license. import { BufReader } from "../io/bufio.ts"; @@ -9,6 +11,11 @@ import { assert } from "../testing/asserts.ts"; const INVALID_RUNE = ["\r", "\n", '"']; +export const ERR_BARE_QUOTE = 'bare " in non-quoted-field'; +export const ERR_QUOTE = 'extraneous or missing " in quoted-field'; +export const ERR_INVALID_DELIM = "Invalid Delimiter"; +export const ERR_FIELD_COUNT = "wrong number of fields"; + export class ParseError extends Error { StartLine: number; Line: number; @@ -49,23 +56,146 @@ function chkOptions(opt: ReadOptions): void { (typeof opt.comment === "string" && INVALID_RUNE.includes(opt.comment)) || opt.comma === opt.comment ) { - throw new Error("Invalid Delimiter"); + throw new Error(ERR_INVALID_DELIM); } } -async function read( +async function readRecord( Startline: number, reader: BufReader, opt: ReadOptions = { comma: ",", trimLeadingSpace: false } ): Promise { const tp = new TextProtoReader(reader); - let line: string; - let result: string[] = []; const lineIndex = Startline; + let line = await readLine(tp); + if (line === Deno.EOF) return Deno.EOF; + if (line.length === 0) { + return []; + } + // line starting with comment character is ignored + if (opt.comment && line[0] === opt.comment) { + return []; + } + + assert(opt.comma != null); + + let quoteError: string | null = null; + const quote = '"'; + const quoteLen = quote.length; + const commaLen = opt.comma.length; + let recordBuffer = ""; + const fieldIndexes = [] as number[]; + parseField: for (;;) { + if (opt.trimLeadingSpace) { + line = line.trimLeft(); + } + + if (line.length === 0 || 
!line.startsWith(quote)) { + // Non-quoted string field + const i = line.indexOf(opt.comma); + let field = line; + if (i >= 0) { + field = field.substring(0, i); + } + // Check to make sure a quote does not appear in field. + if (!opt.lazyQuotes) { + const j = field.indexOf(quote); + if (j >= 0) { + quoteError = ERR_BARE_QUOTE; + break parseField; + } + } + recordBuffer += field; + fieldIndexes.push(recordBuffer.length); + if (i >= 0) { + line = line.substring(i + commaLen); + continue parseField; + } + break parseField; + } else { + // Quoted string field + line = line.substring(quoteLen); + for (;;) { + const i = line.indexOf(quote); + if (i >= 0) { + // Hit next quote. + recordBuffer += line.substring(0, i); + line = line.substring(i + quoteLen); + if (line.startsWith(quote)) { + // `""` sequence (append quote). + recordBuffer += quote; + line = line.substring(quoteLen); + } else if (line.startsWith(opt.comma)) { + // `","` sequence (end of field). + line = line.substring(commaLen); + fieldIndexes.push(recordBuffer.length); + continue parseField; + } else if (0 === line.length) { + // `"\n` sequence (end of line). + fieldIndexes.push(recordBuffer.length); + break parseField; + } else if (opt.lazyQuotes) { + // `"` sequence (bare quote). + recordBuffer += quote; + } else { + // `"*` sequence (invalid non-escaped quote). + quoteError = ERR_QUOTE; + break parseField; + } + } else if (line.length > 0 || !(await isEOF(tp))) { + // Hit end of line (copy all data so far). + recordBuffer += line; + const r = await readLine(tp); + if (r === Deno.EOF) { + if (!opt.lazyQuotes) { + quoteError = ERR_QUOTE; + break parseField; + } + fieldIndexes.push(recordBuffer.length); + break parseField; + } + recordBuffer += "\n"; // preserve line feed (This is because TextProtoReader removes it.) + line = r; + } else { + // Abrupt end of file (EOF or error). 
+ if (!opt.lazyQuotes) { + quoteError = ERR_QUOTE; + break parseField; + } + fieldIndexes.push(recordBuffer.length); + break parseField; + } + } + } + } + if (quoteError) { + throw new ParseError(Startline, lineIndex, quoteError); + } + const result = [] as string[]; + let preIdx = 0; + for (const i of fieldIndexes) { + result.push(recordBuffer.slice(preIdx, i)); + preIdx = i; + } + return result; +} + +async function isEOF(tp: TextProtoReader): Promise { + return (await tp.r.peek(0)) === Deno.EOF; +} + +async function readLine(tp: TextProtoReader): Promise { + let line: string; const r = await tp.readLine(); if (r === Deno.EOF) return Deno.EOF; line = r; + + // For backwards compatibility, drop trailing \r before EOF. + if ((await isEOF(tp)) && line.length > 0 && line[line.length - 1] === "\r") { + line = line.substring(0, line.length - 1); + } + // Normalize \r\n to \n on all input lines. if ( line.length >= 2 && @@ -76,41 +206,7 @@ async function read( line = line + "\n"; } - const trimmedLine = line.trimLeft(); - if (trimmedLine.length === 0) { - return []; - } - - // line starting with comment character is ignored - if (opt.comment && trimmedLine[0] === opt.comment) { - return []; - } - - assert(opt.comma != null); - result = line.split(opt.comma); - - let quoteError = false; - result = result.map((r): string => { - if (opt.trimLeadingSpace) { - r = r.trimLeft(); - } - if (r[0] === '"' && r[r.length - 1] === '"') { - r = r.substring(1, r.length - 1); - } else if (r[0] === '"') { - r = r.substring(1, r.length); - } - - if (!opt.lazyQuotes) { - if (r[0] !== '"' && r.indexOf('"') !== -1) { - quoteError = true; - } - } - return r; - }); - if (quoteError) { - throw new ParseError(Startline, lineIndex, 'bare " in non-quoted-field'); - } - return result; + return line; } export async function readMatrix( @@ -129,7 +225,7 @@ export async function readMatrix( chkOptions(opt); for (;;) { - const r = await read(lineIndex, reader, opt); + const r = await 
readRecord(lineIndex, reader, opt); if (r === Deno.EOF) break; lineResult = r; lineIndex++; @@ -148,7 +244,7 @@ export async function readMatrix( if (lineResult.length > 0) { if (_nbFields && _nbFields !== lineResult.length) { - throw new ParseError(lineIndex, lineIndex, "wrong number of fields"); + throw new ParseError(lineIndex, lineIndex, ERR_FIELD_COUNT); } result.push(lineResult); } diff --git a/encoding/csv_test.ts b/encoding/csv_test.ts index cb61de433..b3d4ec0c9 100644 --- a/encoding/csv_test.ts +++ b/encoding/csv_test.ts @@ -1,15 +1,21 @@ // Test ported from Golang // https://github.com/golang/go/blob/2cc15b1/src/encoding/csv/reader_test.go +// Copyright 2011 The Go Authors. All rights reserved. BSD license. +// https://github.com/golang/go/blob/master/LICENSE +// Copyright 2018-2020 the Deno authors. All rights reserved. MIT license. + import { assertEquals, assert } from "../testing/asserts.ts"; -import { readMatrix, parse } from "./csv.ts"; +import { + readMatrix, + parse, + ERR_BARE_QUOTE, + ERR_QUOTE, + ERR_INVALID_DELIM, + ERR_FIELD_COUNT, +} from "./csv.ts"; import { StringReader } from "../io/readers.ts"; import { BufReader } from "../io/bufio.ts"; -const ErrInvalidDelim = "Invalid Delimiter"; -const ErrFieldCount = "wrong number of fields"; -const ErrBareQuote = 'bare " in non-quoted-field'; - -// TODO(zekth): Activate remaining tests const testCases = [ { Name: "Simple", @@ -43,7 +49,6 @@ zzz,yyy,xxx`, ["a,a", `bbb`, "ccc"], ["zzz", "yyy", "xxx"], ], - ignore: true, }, { Name: "NoEOLTest", @@ -62,8 +67,7 @@ zzz,yyy,xxx`, line","one line","three line field"`, - Output: [["two\nline"], ["one line"], ["three\nline\nfield"]], - ignore: true, + Output: [["two\nline", "one line", "three\nline\nfield"]], }, { Name: "BlankLine", @@ -129,7 +133,7 @@ field"`, { Name: "BadDoubleQuotes", Input: `a""b,c`, - Error: ErrBareQuote, + Error: ERR_BARE_QUOTE, // Error: &ParseError{StartLine: 1, Line: 1, Column: 1, Err: ErrBareQuote}, }, { @@ -141,23 +145,23 @@ 
field"`, { Name: "BadBareQuote", Input: `a "word","b"`, - Error: ErrBareQuote, + Error: ERR_BARE_QUOTE, // &ParseError{StartLine: 1, Line: 1, Column: 2, Err: ErrBareQuote} }, { Name: "BadTrailingQuote", Input: `"a word",b"`, - Error: ErrBareQuote, + Error: ERR_BARE_QUOTE, }, { Name: "ExtraneousQuote", Input: `"a "word","b"`, - Error: ErrBareQuote, + Error: ERR_QUOTE, }, { Name: "BadFieldCount", Input: "a,b,c\nd,e", - Error: ErrFieldCount, + Error: ERR_FIELD_COUNT, UseFieldsPerRecord: true, FieldsPerRecord: 0, }, @@ -167,7 +171,7 @@ field"`, // Error: &ParseError{StartLine: 1, Line: 1, Err: ErrFieldCount}, UseFieldsPerRecord: true, FieldsPerRecord: 2, - Error: ErrFieldCount, + Error: ERR_FIELD_COUNT, }, { Name: "FieldCount", @@ -261,22 +265,19 @@ x,,, { Name: "StartLine1", // Issue 19019 Input: 'a,"b\nc"d,e', - Error: true, + Error: ERR_QUOTE, // Error: &ParseError{StartLine: 1, Line: 2, Column: 1, Err: ErrQuote}, - ignore: true, }, { Name: "StartLine2", Input: 'a,b\n"d\n\n,e', - Error: true, + Error: ERR_QUOTE, // Error: &ParseError{StartLine: 2, Line: 5, Column: 0, Err: ErrQuote}, - ignore: true, }, { Name: "CRLFInQuotedField", // Issue 21201 Input: 'A,"Hello\r\nHi",B\r\n', Output: [["A", "Hello\nHi", "B"]], - ignore: true, }, { Name: "BinaryBlobField", // Issue 19410 @@ -287,32 +288,27 @@ x,,, Name: "TrailingCR", Input: "field1,field2\r", Output: [["field1", "field2"]], - ignore: true, }, { Name: "QuotedTrailingCR", Input: '"field"\r', - Output: [['"field"']], - ignore: true, + Output: [["field"]], }, { Name: "QuotedTrailingCRCR", Input: '"field"\r\r', - Error: true, + Error: ERR_QUOTE, // Error: &ParseError{StartLine: 1, Line: 1, Column: 6, Err: ErrQuote}, - ignore: true, }, { Name: "FieldCR", Input: "field\rfield\r", Output: [["field\rfield"]], - ignore: true, }, { Name: "FieldCRCR", Input: "field\r\rfield\r\r", Output: [["field\r\rfield\r"]], - ignore: true, }, { Name: "FieldCRCRLF", @@ -328,7 +324,6 @@ x,,, Name: "FieldCRCRLFCRCR", Input: 
"field\r\r\n\r\rfield\r\r\n\r\r", Output: [["field\r"], ["\r\rfield\r"], ["\r"]], - ignore: true, }, { Name: "MultiFieldCRCRLFCRCR", @@ -338,7 +333,6 @@ x,,, ["\r\rfield1", "field2\r"], ["\r\r", ""], ], - ignore: true, }, { Name: "NonASCIICommaAndComment", @@ -374,12 +368,11 @@ x,,, Name: "QuotedFieldMultipleLF", Input: '"\n\n\n\n"', Output: [["\n\n\n\n"]], - ignore: true, }, { Name: "MultipleCRLF", Input: "\r\n\r\n\r\n\r\n", - ignore: true, + Output: [], }, /** * The implementation may read each line in several chunks if @@ -392,12 +385,12 @@ x,,, "#ignore\n".repeat(10000) + "@".repeat(5000) + "," + "*".repeat(5000), Output: [["@".repeat(5000), "*".repeat(5000)]], Comment: "#", - ignore: true, + ignore: true, // TODO(#4521) }, { Name: "QuoteWithTrailingCRLF", Input: '"foo"bar"\r\n', - Error: ErrBareQuote, + Error: ERR_QUOTE, // Error: &ParseError{StartLine: 1, Line: 1, Column: 4, Err: ErrQuote}, }, { @@ -410,58 +403,54 @@ x,,, Name: "DoubleQuoteWithTrailingCRLF", Input: '"foo""bar"\r\n', Output: [[`foo"bar`]], - ignore: true, }, { Name: "EvenQuotes", Input: `""""""""`, Output: [[`"""`]], - ignore: true, }, { Name: "OddQuotes", Input: `"""""""`, - Error: true, + Error: ERR_QUOTE, // Error:" &ParseError{StartLine: 1, Line: 1, Column: 7, Err: ErrQuote}", - ignore: true, }, { Name: "LazyOddQuotes", Input: `"""""""`, Output: [[`"""`]], LazyQuotes: true, - ignore: true, }, { Name: "BadComma1", Comma: "\n", - Error: ErrInvalidDelim, + Error: ERR_INVALID_DELIM, }, { Name: "BadComma2", Comma: "\r", - Error: ErrInvalidDelim, + Error: ERR_INVALID_DELIM, }, { Name: "BadComma3", Comma: '"', - Error: ErrInvalidDelim, + Error: ERR_INVALID_DELIM, }, { Name: "BadComment1", Comment: "\n", - Error: ErrInvalidDelim, + Error: ERR_INVALID_DELIM, }, { Name: "BadComment2", Comment: "\r", - Error: ErrInvalidDelim, + Error: ERR_INVALID_DELIM, }, { Name: "BadCommaComment", Comma: "X", Comment: "X", - Error: ErrInvalidDelim, + Error: ERR_INVALID_DELIM, }, ]; for (const t of testCases) {