refactor(internal): cleanup and restructure diffstr() (#4703)

* refactor(internal): cleanup and restructure `diffstr()`

* work
This commit is contained in:
Asher Gomez 2024-05-09 20:36:45 +10:00 committed by GitHub
parent 54f93b8f94
commit c53ca87301
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -3,35 +3,42 @@ import type { DiffResult } from "./_types.ts";
import { diff } from "./diff.ts";
/**
* Renders the differences between the actual and expected strings.
* Partially inspired by https://github.com/kpdecker/jsdiff
* @param A Actual string
* @param B Expected string
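* Unescape invisible characters: backspace, form feed, tab and vertical tab
* are each replaced with their visible escape sequence (for example, a tab
* becomes the two characters `\t`). Line breaks `\n` and `\r\n` are kept and
* prefixed with their escape sequence; a lone `\r` is replaced by `\r` as text.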
*
* @see {@link https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#escape_sequences}
*
* @param string String to unescape.
*
* @returns Unescaped string.
*/
export function diffstr(A: string, B: string): DiffResult<string>[] {
function unescape(string: string): string {
// unescape invisible characters.
// ref: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#escape_sequences
function unescape(string: string): string {
return string
.replaceAll("\b", "\\b")
.replaceAll("\f", "\\f")
.replaceAll("\t", "\\t")
.replaceAll("\v", "\\v")
.replaceAll( // does not remove line breaks
// This does not remove line breaks
.replaceAll(
/\r\n|\r|\n/g,
(str) => str === "\r" ? "\\r" : str === "\n" ? "\\n\n" : "\\r\\n\r\n",
);
}
}
function tokenize(string: string, { wordDiff = false } = {}): string[] {
if (wordDiff) {
// Split string on whitespace symbols
const tokens = string.split(/([^\S\r\n]+|[()[\]{}'"\r\n]|\b)/);
// Extended Latin character set
const words =
const WHITESPACE_SYMBOLS = /([^\S\r\n]+|[()[\]{}'"\r\n]|\b)/;
const EXT_LATIN_CHARS =
/^[a-zA-Z\u{C0}-\u{FF}\u{D8}-\u{F6}\u{F8}-\u{2C6}\u{2C8}-\u{2D7}\u{2DE}-\u{2FF}\u{1E00}-\u{1EFF}]+$/u;
// Join boundary splits that we do not consider to be boundaries and merge empty strings surrounded by word chars
/**
* Tokenizes a string into an array of tokens.
*
* @param string The string to tokenize.
* @param wordDiff If true, performs word-based tokenization; otherwise the
* string is split on line breaks. Default is false.
*
* @returns An array of tokens.
*/
function tokenize(string: string, wordDiff = false): string[] {
if (wordDiff) {
const tokens = string.split(WHITESPACE_SYMBOLS).filter((token) => token);
for (let i = 0; i < tokens.length - 1; i++) {
const token = tokens[i];
const tokenPlusTwo = tokens[i + 2];
@ -39,26 +46,19 @@ export function diffstr(A: string, B: string): DiffResult<string>[] {
!tokens[i + 1] &&
token &&
tokenPlusTwo &&
words.test(token) &&
words.test(tokenPlusTwo)
EXT_LATIN_CHARS.test(token) &&
EXT_LATIN_CHARS.test(tokenPlusTwo)
) {
tokens[i] += tokenPlusTwo;
tokens.splice(i + 1, 2);
i--;
}
}
return tokens.filter((token) => token);
} else {
// Split string on new lines symbols
const tokens: string[] = [];
const lines = string.split(/(\n|\r\n)/);
// Ignore final empty token when text ends with a newline
if (!lines[lines.length - 1]) {
lines.pop();
return tokens;
}
const tokens: string[] = [];
const lines = string.split(/(\n|\r\n)/).filter((line) => line);
// Merge the content and line separators into single tokens
for (const [i, line] of lines.entries()) {
if (i % 2) {
tokens[tokens.length - 1] += line;
@ -67,15 +67,21 @@ export function diffstr(A: string, B: string): DiffResult<string>[] {
}
}
return tokens;
}
}
}
// Create details by filtering relevant word-diff for current line
// and merge "space-diff" if surrounded by word-diff for cleaner displays
function createDetails(
/**
* Create details by filtering relevant word-diff for current line and merge
* "space-diff" if surrounded by word-diff for cleaner displays.
*
* @param line Current line
* @param tokens Word-diff tokens
*
* @returns Array of diff results.
*/
function createDetails(
line: DiffResult<string>,
tokens: Array<DiffResult<string>>,
) {
) {
return tokens.filter(({ type }) => type === line.type || type === "common")
.map((result, i, t) => {
const token = t[i - 1];
@ -90,8 +96,18 @@ export function diffstr(A: string, B: string): DiffResult<string>[] {
}
return result;
});
}
}
/**
* Renders the differences between the actual and expected strings. Partially
* inspired by {@link https://github.com/kpdecker/jsdiff}.
*
* @param A Actual string
* @param B Expected string
*
* @returns Array of diff results.
*/
export function diffstr(A: string, B: string): DiffResult<string>[] {
// Compute multi-line diff
const diffResult = diff(
tokenize(`${unescape(A)}\n`),
@ -120,8 +136,8 @@ export function diffstr(A: string, B: string): DiffResult<string>[] {
while (bLines.length) {
b = bLines.shift();
const tokenized = [
tokenize(a.value, { wordDiff: true }),
tokenize(b?.value ?? "", { wordDiff: true }),
tokenize(a.value, true),
tokenize(b?.value ?? "", true),
] as [string[], string[]];
if (hasMoreRemovedLines) tokenized.reverse();
tokens = diff(tokenized[0], tokenized[1]);