refactor(internal): cleanup and restructure diffstr() (#4703)

* refactor(internal): cleanup and restructure `diffstr()`
* work
2024-11-21 20:50:22 +00:00 · 2024-05-09 20:36:45 +10:00 · 2024-05-09 20:36:45 +10:00 · c53ca87301
commit c53ca87301
parent 54f93b8f94
1 changed files with 99 additions and 83 deletions
--- a/internal/diff_str.ts
+++ b/internal/diff_str.ts
@ -3,95 +3,111 @@ import type { DiffResult } from "./_types.ts";
 import { diff } from "./diff.ts";

 /**
- * Renders the differences between the actual and expected strings
- * Partially inspired from https://github.com/kpdecker/jsdiff
- * @param A Actual string
- * @param B Expected string
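+ * Escapes invisible characters, turning them into visible escape sequences
+ * (for example, a tab becomes the two characters `\t`).
+ *
+ * @see {@link https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#escape_sequences}
+ *
+ * @param string String whose invisible characters should be made visible.
+ *
+ * @returns The string with invisible characters written as escape sequences.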
 */
-export function diffstr(A: string, B: string): DiffResult<string>[] {
-  function unescape(string: string): string {
-    // unescape invisible characters.
-    // ref: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#escape_sequences
-    return string
-      .replaceAll("\b", "\\b")
-      .replaceAll("\f", "\\f")
-      .replaceAll("\t", "\\t")
-      .replaceAll("\v", "\\v")
-      .replaceAll( // does not remove line breaks
-        /\r\n|\r|\n/g,
-        (str) => str === "\r" ? "\\r" : str === "\n" ? "\\n\n" : "\\r\\n\r\n",
-      );
+function unescape(string: string): string {
+  return string
+    .replaceAll("\b", "\\b")
+    .replaceAll("\f", "\\f")
+    .replaceAll("\t", "\\t")
+    .replaceAll("\v", "\\v")
+    // Keeps the real line break after \n and \r\n; a lone \r is only escaped
+    .replaceAll(
+      /\r\n|\r|\n/g,
+      (str) => str === "\r" ? "\\r" : str === "\n" ? "\\n\n" : "\\r\\n\r\n",
+    );
+}
+
+const WHITESPACE_SYMBOLS = /([^\S\r\n]+|[()[\]{}'"\r\n]|\b)/;
+const EXT_LATIN_CHARS =
+  /^[a-zA-Z\u{C0}-\u{FF}\u{D8}-\u{F6}\u{F8}-\u{2C6}\u{2C8}-\u{2D7}\u{2DE}-\u{2FF}\u{1E00}-\u{1EFF}]+$/u;
+
+/**
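+ * Splits a string into tokens for diffing: on line breaks by default, or on
+ * word and symbol boundaries when `wordDiff` is true. Empty tokens are dropped.
+ *
+ * @param string The string to tokenize.
+ * @param wordDiff If true, splits on whitespace runs, brackets, quotes, line
+ * breaks and word boundaries instead of on line breaks. Default is false.
+ *
+ * @returns An array of non-empty tokens, in the order they appear in `string`.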
+ */
+function tokenize(string: string, wordDiff = false): string[] {
+  if (wordDiff) {
+    const tokens = string.split(WHITESPACE_SYMBOLS).filter((token) => token);
+    for (let i = 0; i < tokens.length - 1; i++) {
+      const token = tokens[i];
+      const tokenPlusTwo = tokens[i + 2];
+      if (
+        !tokens[i + 1] &&
+        token &&
+        tokenPlusTwo &&
+        EXT_LATIN_CHARS.test(token) &&
+        EXT_LATIN_CHARS.test(tokenPlusTwo)
+      ) {
+        tokens[i] += tokenPlusTwo;
+        tokens.splice(i + 1, 2);
+        i--;
+      }
+    }
+    return tokens;
  }
+  const tokens: string[] = [];
+  const lines = string.split(/(\n|\r\n)/).filter((line) => line);

-  function tokenize(string: string, { wordDiff = false } = {}): string[] {
-    if (wordDiff) {
-      // Split string on whitespace symbols
-      const tokens = string.split(/([^\S\r\n]+|[()[\]{}'"\r\n]|\b)/);
-      // Extended Latin character set
-      const words =
-        /^[a-zA-Z\u{C0}-\u{FF}\u{D8}-\u{F6}\u{F8}-\u{2C6}\u{2C8}-\u{2D7}\u{2DE}-\u{2FF}\u{1E00}-\u{1EFF}]+$/u;
-
-      // Join boundary splits that we do not consider to be boundaries and merge empty strings surrounded by word chars
-      for (let i = 0; i < tokens.length - 1; i++) {
-        const token = tokens[i];
-        const tokenPlusTwo = tokens[i + 2];
-        if (
-          !tokens[i + 1] &&
-          token &&
-          tokenPlusTwo &&
-          words.test(token) &&
-          words.test(tokenPlusTwo)
-        ) {
-          tokens[i] += tokenPlusTwo;
-          tokens.splice(i + 1, 2);
-          i--;
-        }
-      }
-      return tokens.filter((token) => token);
+  for (const [i, line] of lines.entries()) {
+    if (i % 2) {
+      tokens[tokens.length - 1] += line;
    } else {
-      // Split string on new lines symbols
-      const tokens: string[] = [];
-      const lines = string.split(/(\n|\r\n)/);
-
-      // Ignore final empty token when text ends with a newline
-      if (!lines[lines.length - 1]) {
-        lines.pop();
-      }
-
-      // Merge the content and line separators into single tokens
-      for (const [i, line] of lines.entries()) {
-        if (i % 2) {
-          tokens[tokens.length - 1] += line;
-        } else {
-          tokens.push(line);
-        }
-      }
-      return tokens;
+      tokens.push(line);
    }
  }
+  return tokens;
+}

-  // Create details by filtering relevant word-diff for current line
-  // and merge "space-diff" if surrounded by word-diff for cleaner displays
-  function createDetails(
-    line: DiffResult<string>,
-    tokens: Array<DiffResult<string>>,
-  ) {
-    return tokens.filter(({ type }) => type === line.type || type === "common")
-      .map((result, i, t) => {
-        const token = t[i - 1];
-        if (
-          (result.type === "common") && token &&
-          (token.type === t[i + 1]?.type) && /\s+/.test(result.value)
-        ) {
-          return {
-            ...result,
-            type: token.type,
-          };
-        }
-        return result;
-      });
-  }
+/**
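+ * Builds the word-level details for a line. Keeps only the tokens whose type
+ * matches the line's type or is "common", then relabels a "common" token that
+ * contains whitespace with its neighbors' type when the kept tokens before and
+ * after it share the same type, which gives a cleaner display.
+ *
+ * @param line Line-level diff result whose type selects the tokens to keep.
+ * @param tokens Word-level diff results to filter.
+ *
+ * @returns The kept diff results, with qualifying whitespace tokens relabeled.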
+ */
+function createDetails(
+  line: DiffResult<string>,
+  tokens: Array<DiffResult<string>>,
+) {
+  return tokens.filter(({ type }) => type === line.type || type === "common")
+    .map((result, i, t) => {
+      const token = t[i - 1];
+      if (
+        (result.type === "common") && token &&
+        (token.type === t[i + 1]?.type) && /\s+/.test(result.value)
+      ) {
+        return {
+          ...result,
+          type: token.type,
+        };
+      }
+      return result;
+    });
+}

+/**
+ * Renders the differences between the actual and expected strings. Partially
+ * inspired by {@link https://github.com/kpdecker/jsdiff}.
+ *
+ * @param A Actual string
+ * @param B Expected string
+ *
+ * @returns Array of diff results.
+ */
+export function diffstr(A: string, B: string): DiffResult<string>[] {
  // Compute multi-line diff
  const diffResult = diff(
    tokenize(`${unescape(A)}\n`),
@ -120,8 +136,8 @@ export function diffstr(A: string, B: string): DiffResult<string>[] {
    while (bLines.length) {
      b = bLines.shift();
      const tokenized = [
-        tokenize(a.value, { wordDiff: true }),
-        tokenize(b?.value ?? "", { wordDiff: true }),
+        tokenize(a.value, true),
+        tokenize(b?.value ?? "", true),
      ] as [string[], string[]];
      if (hasMoreRemovedLines) tokenized.reverse();
      tokens = diff(tokenized[0], tokenized[1]);