refactor(internal): cleanup and restructure diffstr() (#4703)

* refactor(internal): cleanup and restructure `diffstr()`

* work
This commit is contained in:
Asher Gomez 2024-05-09 20:36:45 +10:00 committed by GitHub
parent 54f93b8f94
commit c53ca87301
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -3,95 +3,111 @@ import type { DiffResult } from "./_types.ts";
import { diff } from "./diff.ts";
/**
* Renders the differences between the actual and expected strings
* Partially inspired from https://github.com/kpdecker/jsdiff
* @param A Actual string
* @param B Expected string
* Unescape invisible characters.
*
* @see {@link https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#escape_sequences}
*
* @param string String to unescape.
*
* @returns Unescaped string.
*/
export function diffstr(A: string, B: string): DiffResult<string>[] {
function unescape(string: string): string {
// unescape invisible characters.
// ref: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#escape_sequences
return string
.replaceAll("\b", "\\b")
.replaceAll("\f", "\\f")
.replaceAll("\t", "\\t")
.replaceAll("\v", "\\v")
.replaceAll( // does not remove line breaks
/\r\n|\r|\n/g,
(str) => str === "\r" ? "\\r" : str === "\n" ? "\\n\n" : "\\r\\n\r\n",
);
function unescape(string: string): string {
return string
.replaceAll("\b", "\\b")
.replaceAll("\f", "\\f")
.replaceAll("\t", "\\t")
.replaceAll("\v", "\\v")
// This does not remove line breaks
.replaceAll(
/\r\n|\r|\n/g,
(str) => str === "\r" ? "\\r" : str === "\n" ? "\\n\n" : "\\r\\n\r\n",
);
}
const WHITESPACE_SYMBOLS = /([^\S\r\n]+|[()[\]{}'"\r\n]|\b)/;
const EXT_LATIN_CHARS =
/^[a-zA-Z\u{C0}-\u{FF}\u{D8}-\u{F6}\u{F8}-\u{2C6}\u{2C8}-\u{2D7}\u{2DE}-\u{2FF}\u{1E00}-\u{1EFF}]+$/u;
/**
* Tokenizes a string into an array of tokens.
*
* @param string The string to tokenize.
* @param wordDiff If true, performs word-based tokenization. Default is false.
*
* @returns An array of tokens.
*/
function tokenize(string: string, wordDiff = false): string[] {
if (wordDiff) {
const tokens = string.split(WHITESPACE_SYMBOLS).filter((token) => token);
for (let i = 0; i < tokens.length - 1; i++) {
const token = tokens[i];
const tokenPlusTwo = tokens[i + 2];
if (
!tokens[i + 1] &&
token &&
tokenPlusTwo &&
EXT_LATIN_CHARS.test(token) &&
EXT_LATIN_CHARS.test(tokenPlusTwo)
) {
tokens[i] += tokenPlusTwo;
tokens.splice(i + 1, 2);
i--;
}
}
return tokens;
}
const tokens: string[] = [];
const lines = string.split(/(\n|\r\n)/).filter((line) => line);
function tokenize(string: string, { wordDiff = false } = {}): string[] {
if (wordDiff) {
// Split string on whitespace symbols
const tokens = string.split(/([^\S\r\n]+|[()[\]{}'"\r\n]|\b)/);
// Extended Latin character set
const words =
/^[a-zA-Z\u{C0}-\u{FF}\u{D8}-\u{F6}\u{F8}-\u{2C6}\u{2C8}-\u{2D7}\u{2DE}-\u{2FF}\u{1E00}-\u{1EFF}]+$/u;
// Join boundary splits that we do not consider to be boundaries and merge empty strings surrounded by word chars
for (let i = 0; i < tokens.length - 1; i++) {
const token = tokens[i];
const tokenPlusTwo = tokens[i + 2];
if (
!tokens[i + 1] &&
token &&
tokenPlusTwo &&
words.test(token) &&
words.test(tokenPlusTwo)
) {
tokens[i] += tokenPlusTwo;
tokens.splice(i + 1, 2);
i--;
}
}
return tokens.filter((token) => token);
for (const [i, line] of lines.entries()) {
if (i % 2) {
tokens[tokens.length - 1] += line;
} else {
// Split string on new lines symbols
const tokens: string[] = [];
const lines = string.split(/(\n|\r\n)/);
// Ignore final empty token when text ends with a newline
if (!lines[lines.length - 1]) {
lines.pop();
}
// Merge the content and line separators into single tokens
for (const [i, line] of lines.entries()) {
if (i % 2) {
tokens[tokens.length - 1] += line;
} else {
tokens.push(line);
}
}
return tokens;
tokens.push(line);
}
}
return tokens;
}
// Create details by filtering relevant word-diff for current line
// and merge "space-diff" if surrounded by word-diff for cleaner displays
function createDetails(
line: DiffResult<string>,
tokens: Array<DiffResult<string>>,
) {
return tokens.filter(({ type }) => type === line.type || type === "common")
.map((result, i, t) => {
const token = t[i - 1];
if (
(result.type === "common") && token &&
(token.type === t[i + 1]?.type) && /\s+/.test(result.value)
) {
return {
...result,
type: token.type,
};
}
return result;
});
}
/**
* Create details by filtering relevant word-diff for current line and merge
* "space-diff" if surrounded by word-diff for cleaner displays.
*
* @param line Current line
* @param tokens Word-diff tokens
*
* @returns Array of diff results.
*/
function createDetails(
line: DiffResult<string>,
tokens: Array<DiffResult<string>>,
) {
return tokens.filter(({ type }) => type === line.type || type === "common")
.map((result, i, t) => {
const token = t[i - 1];
if (
(result.type === "common") && token &&
(token.type === t[i + 1]?.type) && /\s+/.test(result.value)
) {
return {
...result,
type: token.type,
};
}
return result;
});
}
/**
* Renders the differences between the actual and expected strings. Partially
* inspired from {@link https://github.com/kpdecker/jsdiff}.
*
* @param A Actual string
* @param B Expected string
*
* @returns Array of diff results.
*/
export function diffstr(A: string, B: string): DiffResult<string>[] {
// Compute multi-line diff
const diffResult = diff(
tokenize(`${unescape(A)}\n`),
@ -120,8 +136,8 @@ export function diffstr(A: string, B: string): DiffResult<string>[] {
while (bLines.length) {
b = bLines.shift();
const tokenized = [
tokenize(a.value, { wordDiff: true }),
tokenize(b?.value ?? "", { wordDiff: true }),
tokenize(a.value, true),
tokenize(b?.value ?? "", true),
] as [string[], string[]];
if (hasMoreRemovedLines) tokenized.reverse();
tokens = diff(tokenized[0], tokenized[1]);