mirror of
https://github.com/denoland/std.git
synced 2024-11-21 20:50:22 +00:00
refactor(internal): cleanup and restructure diffstr()
(#4703)
* refactor(internal): cleanup and restructure `diffstr()` * work
This commit is contained in:
parent
54f93b8f94
commit
c53ca87301
@ -3,95 +3,111 @@ import type { DiffResult } from "./_types.ts";
|
||||
import { diff } from "./diff.ts";
|
||||
|
||||
/**
|
||||
* Renders the differences between the actual and expected strings.
|
||||
* Partially inspired by https://github.com/kpdecker/jsdiff
|
||||
* @param A Actual string
|
||||
* @param B Expected string
|
||||
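* Replaces invisible characters with the visible text of their escape sequences (for example, a tab becomes `\t`). Line breaks are kept and are preceded by their escaped form.
|
||||
*
|
||||
* @see {@link https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#escape_sequences}
|
||||
*
|
||||
* @param string String whose invisible characters should be made visible.
|
||||
*
|
||||
* @returns The string with invisible characters shown as escape sequences.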
|
||||
*/
|
||||
export function diffstr(A: string, B: string): DiffResult<string>[] {
|
||||
function unescape(string: string): string {
|
||||
// unescape invisible characters.
|
||||
// ref: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#escape_sequences
|
||||
return string
|
||||
.replaceAll("\b", "\\b")
|
||||
.replaceAll("\f", "\\f")
|
||||
.replaceAll("\t", "\\t")
|
||||
.replaceAll("\v", "\\v")
|
||||
.replaceAll( // does not remove line breaks
|
||||
/\r\n|\r|\n/g,
|
||||
(str) => str === "\r" ? "\\r" : str === "\n" ? "\\n\n" : "\\r\\n\r\n",
|
||||
);
|
||||
/**
 * Makes invisible characters visible by replacing each one with the text of
 * its escape sequence (backspace, form feed, tab and vertical tab become
 * `\b`, `\f`, `\t` and `\v`).
 *
 * Line breaks are not removed: each `\r\n` or `\n` is kept and preceded by
 * its escaped form, while a lone `\r` is replaced by the text `\r`.
 *
 * @param string String whose invisible characters should be made visible.
 *
 * @returns The string with invisible characters shown as escape sequences.
 */
function unescape(string: string): string {
  // Single characters that map one-to-one onto their escape text.
  const simpleEscapes: Array<[string, string]> = [
    ["\b", "\\b"],
    ["\f", "\\f"],
    ["\t", "\\t"],
    ["\v", "\\v"],
  ];

  let visible = string;
  for (const [character, escapedText] of simpleEscapes) {
    visible = visible.split(character).join(escapedText);
  }

  // Line breaks are handled last so the breaks re-inserted here are not
  // touched by the replacements above.
  return visible.replace(/\r\n|\r|\n/g, (lineBreak) => {
    switch (lineBreak) {
      case "\r":
        return "\\r";
      case "\n":
        return "\\n\n";
      default:
        return "\\r\\n\r\n";
    }
  });
}
|
||||
|
||||
/** Splits on runs of non-line-break whitespace, brackets, quotes, line breaks and word boundaries. */
const WHITESPACE_SYMBOLS = /([^\S\r\n]+|[()[\]{}'"\r\n]|\b)/;
/**
 * Matches a token made only of basic and extended Latin letters.
 *
 * NOTE(review): the range \u{C0}-\u{FF} already contains \u{D8}-\u{F6}; the
 * pattern is kept unchanged so matching behavior stays the same.
 */
const EXT_LATIN_CHARS =
  /^[a-zA-Z\u{C0}-\u{FF}\u{D8}-\u{F6}\u{F8}-\u{2C6}\u{2C8}-\u{2D7}\u{2DE}-\u{2FF}\u{1E00}-\u{1EFF}]+$/u;

/**
 * Tokenizes a string into an array of tokens.
 *
 * In word mode the string is split on whitespace symbols and word boundaries,
 * and pieces that were split inside a run of Latin letters are joined back
 * together. In line mode every token is one line together with its trailing
 * line separator.
 *
 * @param string The string to tokenize.
 * @param wordDiff If true, performs word-based tokenization. Default is false.
 * Both `tokenize(s, true)` and `tokenize(s, { wordDiff: true })` are accepted.
 *
 * @returns An array of tokens; empty tokens are never returned in word mode.
 */
function tokenize(
  string: string,
  wordDiff: boolean | { wordDiff?: boolean } = false,
): string[] {
  const byWord = typeof wordDiff === "boolean"
    ? wordDiff
    : (wordDiff.wordDiff ?? false);

  if (byWord) {
    // Keep the empty strings produced by zero-width `\b` splits for now: the
    // merge below relies on them to spot a boundary inside a word.
    const tokens = string.split(WHITESPACE_SYMBOLS);

    // Join boundary splits that are not real boundaries, i.e. an empty string
    // surrounded by two tokens made of Latin letters.
    for (let i = 0; i < tokens.length - 1; i++) {
      const token = tokens[i];
      const tokenPlusTwo = tokens[i + 2];
      if (
        !tokens[i + 1] &&
        token &&
        tokenPlusTwo &&
        EXT_LATIN_CHARS.test(token) &&
        EXT_LATIN_CHARS.test(tokenPlusTwo)
      ) {
        tokens[i] = token + tokenPlusTwo;
        tokens.splice(i + 1, 2);
        // Re-examine the merged token: it may join with the next piece too.
        i--;
      }
    }

    // Only now drop the empty strings left over from the split.
    return tokens.filter((token) => token);
  }

  // The capture group keeps separators, so the array alternates
  // content, separator, content, separator, ...
  const parts = string.split(/(\n|\r\n)/);

  // Ignore the final empty entry when the text ends with a line break (or is
  // empty). Other empty entries are empty lines and must stay in place so the
  // content/separator alternation is not shifted.
  if (parts.length > 0 && !parts[parts.length - 1]) {
    parts.pop();
  }

  // Merge each content entry with the separator that follows it.
  const tokens: string[] = [];
  for (let i = 0; i < parts.length; i += 2) {
    tokens.push((parts[i] ?? "") + (parts[i + 1] ?? ""));
  }
  return tokens;
}
|
||||
|
||||
/**
 * Builds the word-level details of a line: keeps only the word-diff tokens
 * whose type matches the line's type or is "common", then relabels a "common"
 * token containing whitespace with its neighbours' type when the kept tokens
 * directly before and after it share the same type. This merges "space-diff"
 * into the surrounding word-diff for cleaner displays.
 *
 * @param line Current line
 * @param tokens Word-diff tokens
 *
 * @returns Array of diff results.
 */
function createDetails(
  line: DiffResult<string>,
  tokens: Array<DiffResult<string>>,
) {
  const relevant = tokens.filter((candidate) =>
    candidate.type === line.type || candidate.type === "common"
  );

  return relevant.map((current, index) => {
    const previous = relevant[index - 1];

    // Only a common token with a kept predecessor can be relabelled.
    if (current.type !== "common" || !previous) {
      return current;
    }
    // Both neighbours must agree on the type to take over.
    if (previous.type !== relevant[index + 1]?.type) {
      return current;
    }
    if (!/\s+/.test(current.value)) {
      return current;
    }
    return {
      ...current,
      type: previous.type,
    };
  });
}
|
||||
|
||||
/**
|
||||
* Renders the differences between the actual and expected strings. Partially
|
||||
* inspired by {@link https://github.com/kpdecker/jsdiff}.
|
||||
*
|
||||
* @param A Actual string
|
||||
* @param B Expected string
|
||||
*
|
||||
* @returns Array of diff results.
|
||||
*/
|
||||
export function diffstr(A: string, B: string): DiffResult<string>[] {
|
||||
// Compute multi-line diff
|
||||
const diffResult = diff(
|
||||
tokenize(`${unescape(A)}\n`),
|
||||
@ -120,8 +136,8 @@ export function diffstr(A: string, B: string): DiffResult<string>[] {
|
||||
while (bLines.length) {
|
||||
b = bLines.shift();
|
||||
const tokenized = [
|
||||
tokenize(a.value, { wordDiff: true }),
|
||||
tokenize(b?.value ?? "", { wordDiff: true }),
|
||||
tokenize(a.value, true),
|
||||
tokenize(b?.value ?? "", true),
|
||||
] as [string[], string[]];
|
||||
if (hasMoreRemovedLines) tokenized.reverse();
|
||||
tokens = diff(tokenized[0], tokenized[1]);
|
||||
|
Loading…
Reference in New Issue
Block a user