refactor(internal): cleanup and restructure diffstr() (#4703)

* refactor(internal): cleanup and restructure `diffstr()` * work
2024-11-22 04:59:05 +00:00 · 2024-05-09 20:36:45 +10:00 · 2024-05-09 20:36:45 +10:00 · c53ca87301
commit c53ca87301
parent 54f93b8f94
1 changed files with 99 additions and 83 deletions
--- a/internal/diff_str.ts
+++ b/internal/diff_str.ts
@ -3,35 +3,42 @@ import type { DiffResult } from "./_types.ts";
 import { diff } from "./diff.ts";

 /**
- * Renders the differences between the actual and expected strings
- * Partially inspired from https://github.com/kpdecker/jsdiff
- * @param A Actual string
- * @param B Expected string
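+ * Replaces invisible characters with their escape sequences (despite the name).
+ *
+ * @see {@link https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#escape_sequences}
+ *
+ * @param string String whose invisible characters are to be escaped.
+ *
+ * @returns String with invisible characters replaced by escape sequences.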
 */
-export function diffstr(A: string, B: string): DiffResult<string>[] {
-  function unescape(string: string): string {
-    // unescape invisible characters.
-    // ref: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#escape_sequences
+function unescape(string: string): string {
  return string
    .replaceAll("\b", "\\b")
    .replaceAll("\f", "\\f")
    .replaceAll("\t", "\\t")
    .replaceAll("\v", "\\v")
-      .replaceAll( // does not remove line breaks
+    // Keeps "\n" and "\r\n" breaks (prefixed by their escaped form); a lone "\r" is only escaped
+    .replaceAll(
      /\r\n|\r|\n/g,
      (str) => str === "\r" ? "\\r" : str === "\n" ? "\\n\n" : "\\r\\n\r\n",
    );
-  }
+}

-  function tokenize(string: string, { wordDiff = false } = {}): string[] {
-    if (wordDiff) {
-      // Split string on whitespace symbols
-      const tokens = string.split(/([^\S\r\n]+|[()[\]{}'"\r\n]|\b)/);
-      // Extended Latin character set
-      const words =
+const WHITESPACE_SYMBOLS = /([^\S\r\n]+|[()[\]{}'"\r\n]|\b)/;
+const EXT_LATIN_CHARS =
  /^[a-zA-Z\u{C0}-\u{FF}\u{D8}-\u{F6}\u{F8}-\u{2C6}\u{2C8}-\u{2D7}\u{2DE}-\u{2FF}\u{1E00}-\u{1EFF}]+$/u;

-      // Join boundary splits that we do not consider to be boundaries and merge empty strings surrounded by word chars
+/**
+ * Tokenizes a string into an array of tokens.
+ *
+ * @param string The string to tokenize.
+ * @param wordDiff If true, performs word-based tokenization; otherwise splits into lines. Default is false.
+ *
+ * @returns An array of tokens.
+ */
+function tokenize(string: string, wordDiff = false): string[] {
+  if (wordDiff) {
+    const tokens = string.split(WHITESPACE_SYMBOLS).filter((token) => token);
    for (let i = 0; i < tokens.length - 1; i++) {
      const token = tokens[i];
      const tokenPlusTwo = tokens[i + 2];
@ -39,26 +46,19 @@ export function diffstr(A: string, B: string): DiffResult<string>[] {
        !tokens[i + 1] &&
        token &&
        tokenPlusTwo &&
-          words.test(token) &&
-          words.test(tokenPlusTwo)
+        EXT_LATIN_CHARS.test(token) &&
+        EXT_LATIN_CHARS.test(tokenPlusTwo)
      ) {
        tokens[i] += tokenPlusTwo;
        tokens.splice(i + 1, 2);
        i--;
      }
    }
-      return tokens.filter((token) => token);
-    } else {
-      // Split string on new lines symbols
-      const tokens: string[] = [];
-      const lines = string.split(/(\n|\r\n)/);
-
-      // Ignore final empty token when text ends with a newline
-      if (!lines[lines.length - 1]) {
-        lines.pop();
+    return tokens;
  }
+  const tokens: string[] = [];
+  const lines = string.split(/(\n|\r\n)/).filter((line) => line);

-      // Merge the content and line separators into single tokens
  for (const [i, line] of lines.entries()) {
    if (i % 2) {
      tokens[tokens.length - 1] += line;
@ -67,15 +67,21 @@ export function diffstr(A: string, B: string): DiffResult<string>[] {
    }
  }
  return tokens;
-    }
-  }
+}

-  // Create details by filtering relevant word-diff for current line
-  // and merge "space-diff" if surrounded by word-diff for cleaner displays
-  function createDetails(
+/**
+ * Create details by filtering relevant word-diff for current line and merge
+ * "space-diff" if surrounded by word-diff for cleaner displays.
+ *
+ * @param line Current line
+ * @param tokens Word-diff tokens
+ *
+ * @returns Array of diff results.
+ */
+function createDetails(
  line: DiffResult<string>,
  tokens: Array<DiffResult<string>>,
-  ) {
+) {
  return tokens.filter(({ type }) => type === line.type || type === "common")
    .map((result, i, t) => {
      const token = t[i - 1];
@ -90,8 +96,18 @@ export function diffstr(A: string, B: string): DiffResult<string>[] {
      }
      return result;
    });
-  }
+}

+/**
+ * Renders the differences between the actual and expected strings. Partially
+ * inspired by {@link https://github.com/kpdecker/jsdiff}.
+ *
+ * @param A Actual string
+ * @param B Expected string
+ *
+ * @returns Array of diff results.
+ */
+export function diffstr(A: string, B: string): DiffResult<string>[] {
  // Compute multi-line diff
  const diffResult = diff(
    tokenize(`${unescape(A)}\n`),
@ -120,8 +136,8 @@ export function diffstr(A: string, B: string): DiffResult<string>[] {
    while (bLines.length) {
      b = bLines.shift();
      const tokenized = [
-        tokenize(a.value, { wordDiff: true }),
-        tokenize(b?.value ?? "", { wordDiff: true }),
+        tokenize(a.value, true),
+        tokenize(b?.value ?? "", true),
      ] as [string[], string[]];
      if (hasMoreRemovedLines) tokenized.reverse();
      tokens = diff(tokenized[0], tokenized[1]);