diff --git a/text/_util.ts b/text/_util.ts
index e1d40cbce..9b6b21876 100644
--- a/text/_util.ts
+++ b/text/_util.ts
@@ -1,9 +1,29 @@
 // Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
 
+const CAPITALIZED_WORD_REGEXP = /\p{Lu}\p{Ll}+/u; // e.g. Apple
+const ACRONYM_REGEXP = /\p{Lu}+(?=(\p{Lu}\p{Ll})|\P{L}|\b)/u; // e.g. ID, URL, handles an acronym followed by a capitalized word e.g. HTMLElement
+const LOWERCASED_WORD_REGEXP = /(\p{Ll}+)/u; // e.g. apple
+const ANY_LETTERS = /\p{L}+/u; // will match any sequence of letters, including in languages without a concept of upper/lower case
+const DIGITS_REGEXP = /\p{N}+/u; // e.g. 123
+
+const WORD_OR_NUMBER_REGEXP = new RegExp(
+  `${CAPITALIZED_WORD_REGEXP.source}|${ACRONYM_REGEXP.source}|${LOWERCASED_WORD_REGEXP.source}|${ANY_LETTERS.source}|${DIGITS_REGEXP.source}`,
+  "gu",
+);
+
-export function splitToWords(input: string) {
-  input = input.replaceAll(/[^a-zA-Z0-9\s-_]/g, "");
-  if (/[\s-_]+/.test(input)) return input.split(/[\s-_]+/);
-  return input.split(/(?=[A-Z])+/);
+/**
+ * Splits a string into its words and numbers.
+ *
+ * Each match of `WORD_OR_NUMBER_REGEXP` becomes one entry: a capitalized
+ * word, an acronym, a lowercase word, any other run of letters, or a run of
+ * numeric characters. Everything else (whitespace, punctuation, emoji) is
+ * dropped and only separates entries.
+ *
+ * @param input The string to split.
+ * @returns The matches in input order, or an empty array when there are none.
+ */
+export function splitToWords(input: string): string[] {
+  return input.match(WORD_OR_NUMBER_REGEXP) ?? [];
 }
 
 export function capitalizeWord(word: string): string {
diff --git a/text/_util_test.ts b/text/_util_test.ts
index d64a410e2..155174d0f 100644
--- a/text/_util_test.ts
+++ b/text/_util_test.ts
@@ -3,6 +3,33 @@
 import { assertEquals } from "@std/assert";
 import { splitToWords } from "./_util.ts";
 
+Deno.test({
+  name: "split() returns an empty array for an empty string",
+  fn() {
+    const result = splitToWords("");
+    assertEquals(result.length, 0);
+  },
+});
+
+Deno.test({
+  name:
+    "split() returns an empty array when input has no alphanumeric characters",
+  fn() {
+    const result = splitToWords("🦕♥️ 🦕♥️ 🦕♥️");
+    assertEquals(result.length, 0);
+  },
+});
+
+Deno.test({
+  name: "split() ignores non-alphanumeric characters mixed with words",
+  fn() {
+    const result = splitToWords("🦕deno♥️wuv");
+    const expected = ["deno", "wuv"];
+
+    assertEquals(result, expected);
+  },
+});
+
 Deno.test({
   name: "split() handles whitespace",
   fn() {
@@ -12,6 +39,15 @@ Deno.test({
   },
 });
 
+Deno.test({
+  name: "split() handles whitespace at string end and start",
+  fn() {
+    const result = splitToWords(" deno Is AWESOME ");
+    const expected = ["deno", "Is", "AWESOME"];
+    assertEquals(result, expected);
+  },
+});
+
 Deno.test({
   name: "split() handles mixed delimiters",
   fn() {
@@ -21,6 +57,15 @@ Deno.test({
   },
 });
 
+Deno.test({
+  name: "split() handles a delimiter sequence",
+  fn() {
+    const result = splitToWords("I am -> thirsty!");
+    const expected = ["I", "am", "thirsty"];
+    assertEquals(result, expected);
+  },
+});
+
 Deno.test({
   name: "split() handles upper case delimiter",
   fn() {
@@ -39,6 +84,42 @@ Deno.test({
   },
 });
 
+Deno.test({
+  name: "split() handles casing",
+  fn() {
+    const result = splitToWords("denoIsAwesome");
+    const expected = ["deno", "Is", "Awesome"];
+    assertEquals(result, expected);
+  },
+});
+
+Deno.test({
+  name: "split() handles unicode",
+  fn() {
+    const result = splitToWords("шруберри IsAwesome");
+    const expected = ["шруберри", "Is", "Awesome"];
+    assertEquals(result, expected);
+  },
+});
+
+Deno.test({
+  name: "split() handles unicode casing",
+  fn() {
+    const result = splitToWords("шруберриШруберри");
+    const expected = ["шруберри", "Шруберри"];
+    assertEquals(result, expected);
+  },
+});
+
+Deno.test({
+  name: "split() handles languages without casing",
+  fn() {
+    const result = splitToWords("אין_על דינו");
+    const expected = ["אין", "על", "דינו"];
+    assertEquals(result, expected);
+  },
+});
+
 Deno.test({
   name: "split() handles screaming snake case",
   fn() {
@@ -48,6 +129,15 @@ Deno.test({
   },
 });
 
+Deno.test({
+  name: "split() handles acronym followed by a capitalized word",
+  fn() {
+    const result = splitToWords("I Love HTMLDivElement");
+    const expected = ["I", "Love", "HTML", "Div", "Element"];
+    assertEquals(result, expected);
+  },
+});
+
 Deno.test({
   name: "split() handles underscore delimiter",
   fn() {
diff --git a/text/case_test.ts b/text/case_test.ts
index a69e5cc06..1f7e0ffb8 100644
--- a/text/case_test.ts
+++ b/text/case_test.ts
@@ -93,6 +93,12 @@ Deno.test("toPascalCase() trims whitespace", () => {
   assertEquals(result, expected);
 });
 
+Deno.test("toPascalCase() converts a single word with Cyrillic letters", () => {
+  const input = "шруберри";
+  const expected = "Шруберри";
+  assertEquals(toPascalCase(input), expected);
+});
+
 Deno.test("toSnakeCase() handles an empty string", () => {
   assertEquals(toSnakeCase(""), "");
 });
@@ -121,6 +127,11 @@ Deno.test("toSnakeCase() trims whitespace", () => {
   assertEquals(result, expected);
 });
 
+Deno.test("toSnakeCase() splits words before and after the numbers", () => {
+  assertEquals(toSnakeCase("str2Num"), "str_2_num");
+  assertEquals(toSnakeCase("Str2Num"), "str_2_num");
+});
+
 Deno.test("toConstantCase() converts a single word", () => {
   const input = "shruberry";
   const expected = "SHRUBERRY";