fix(text): unicode support and word splitting according to case (#5447)

Co-authored-by: Yoshiya Hinosawa <stibium121@gmail.com>
Co-authored-by: Asher Gomez <ashersaupingomez@gmail.com>
This commit is contained in:
GuyBorderless 2024-07-22 14:45:05 +03:00 committed by GitHub
parent e1935ecc82
commit 97c5596f0b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 122 additions and 3 deletions

View File

@ -1,9 +1,18 @@
// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
/** An uppercase letter followed by one or more lowercase letters, e.g. `Apple`. */
const CAPITALIZED_WORD_REGEXP = /\p{Lu}\p{Ll}+/u;
/**
 * A run of uppercase letters, e.g. `ID` or `URL`. The lookahead stops the run
 * before a following capitalized word, so `HTMLElement` yields `HTML`.
 */
const ACRONYM_REGEXP = /\p{Lu}+(?=(\p{Lu}\p{Ll})|\P{L}|\b)/u;
/** A run of lowercase letters, e.g. `apple`. */
const LOWERCASED_WORD_REGEXP = /(\p{Ll}+)/u;
/** Any run of letters, covering scripts that have no upper/lower case. */
const ANY_LETTERS = /\p{L}+/u;
/** A run of numeric characters, e.g. `123`. */
const DIGITS_REGEXP = /\p{N}+/u;
/**
 * Global alternation of all the patterns above. Order matters: earlier
 * alternatives are tried first at each position.
 */
const WORD_OR_NUMBER_REGEXP = new RegExp(
  [
    CAPITALIZED_WORD_REGEXP,
    ACRONYM_REGEXP,
    LOWERCASED_WORD_REGEXP,
    ANY_LETTERS,
    DIGITS_REGEXP,
  ]
    .map((pattern) => pattern.source)
    .join("|"),
  "gu",
);
/**
 * Splits `input` into its words and numbers.
 *
 * Every match of `WORD_OR_NUMBER_REGEXP` becomes one element of the result;
 * characters that the pattern does not match (whitespace, punctuation, emoji,
 * ...) act as separators and are dropped.
 *
 * @param input The string to split.
 * @returns The matched words and numbers in order of appearance, or an empty
 * array when nothing matches.
 */
export function splitToWords(input: string): string[] {
  // The previous ASCII-only path (strip non-[a-zA-Z0-9], split on separators
  // or before [A-Z]) returned first and made this match unreachable; it also
  // discarded every non-ASCII letter. Only the Unicode-aware match remains.
  return input.match(WORD_OR_NUMBER_REGEXP) ?? [];
}
export function capitalizeWord(word: string): string {

View File

@ -3,6 +3,33 @@
import { assertEquals } from "@std/assert";
import { splitToWords } from "./_util.ts";
// An empty input contains no words at all.
Deno.test("split() returns an empty array for an empty string", () => {
  const words = splitToWords("");
  assertEquals(words.length, 0);
});
// Emoji and symbols alone must not produce any words.
Deno.test(
  "split() returns an empty array when input has no alphanumeric characters",
  () => {
    const words = splitToWords("🦕♥️ 🦕♥️ 🦕♥️");
    assertEquals(words.length, 0);
  },
);
// Non-alphanumeric characters between words act as separators and are dropped.
Deno.test("split() ignores non-alphanumeric characters mixed with words", () => {
  assertEquals(splitToWords("🦕deno♥wuv"), ["deno", "wuv"]);
});
Deno.test({
name: "split() handles whitespace",
fn() {
@ -12,6 +39,15 @@ Deno.test({
},
});
// Leading and trailing whitespace must not yield empty words.
Deno.test("split() handles whitespace at string end and start", () => {
  assertEquals(splitToWords(" deno Is AWESOME "), ["deno", "Is", "AWESOME"]);
});
Deno.test({
name: "split() handles mixed delimiters",
fn() {
@ -21,6 +57,15 @@ Deno.test({
},
});
// A run of several separator characters (" -> ") counts as a single break.
Deno.test("split() handles a delimiter sequence", () => {
  assertEquals(splitToWords("I am -> thirsty!"), ["I", "am", "thirsty"]);
});
Deno.test({
name: "split() handles upper case delimiter",
fn() {
@ -39,6 +84,42 @@ Deno.test({
},
});
// camelCase input is split at each lowercase-to-uppercase transition.
Deno.test("split() handles casing", () => {
  assertEquals(splitToWords("denoIsAwesome"), ["deno", "Is", "Awesome"]);
});
// Non-ASCII (Cyrillic) words are kept alongside ASCII ones.
Deno.test("split() handles unicode", () => {
  assertEquals(
    splitToWords("шруберри IsAwesome"),
    ["шруберри", "Is", "Awesome"],
  );
});
// Case-based splitting also applies to non-ASCII (Cyrillic) letters.
Deno.test("split() handles unicode casing", () => {
  assertEquals(splitToWords("шруберриШруберри"), ["шруберри", "Шруберри"]);
});
// Scripts without upper/lower case (Hebrew here) are split on separators only.
Deno.test({
  name: "split() handles languages without casing",
  fn() {
    // The input literal had lost its opening quote and first letter; it is
    // restored so that it agrees with the expected first word.
    const result = splitToWords("אין_על דינו");
    const expected = ["אין", "על", "דינו"];
    assertEquals(result, expected);
  },
});
Deno.test({
name: "split() handles screaming snake case",
fn() {
@ -48,6 +129,15 @@ Deno.test({
},
});
// An acronym directly followed by capitalized words is split off as one word.
Deno.test("split() handles acronym followed by a capitalized word", () => {
  assertEquals(
    splitToWords("I Love HTMLDivElement"),
    ["I", "Love", "HTML", "Div", "Element"],
  );
});
Deno.test({
name: "split() handles underscore delimiter",
fn() {
@ -56,3 +146,12 @@ Deno.test({
assertEquals(result, expected);
},
});
// This test used to repeat "split() handles acronym followed by a capitalized
// word" verbatim; it now covers an acronym that ends the input instead.
Deno.test({
  name: "split() handles acronym at the end of the input",
  fn() {
    const result = splitToWords("parseURL");
    const expected = ["parse", "URL"];
    assertEquals(result, expected);
  },
});

View File

@ -93,6 +93,12 @@ Deno.test("toPascalCase() trims whitespace", () => {
assertEquals(result, expected);
});
// Capitalization must work for non-ASCII (Cyrillic) letters too.
Deno.test({
  name: "toPascalCase() converts a single word with Cyrillic letters",
  fn() {
    assertEquals(toPascalCase("шруберри"), "Шруберри");
  },
});
// An empty input converts to an empty output.
Deno.test({
  name: "toSnakeCase() handles an empty string",
  fn() {
    const result = toSnakeCase("");
    assertEquals(result, "");
  },
});
@ -121,6 +127,11 @@ Deno.test("toSnakeCase() trims whitespace", () => {
assertEquals(result, expected);
});
// Digits form their own word, whatever the case of the leading letter.
Deno.test({
  name: "toSnakeCase() splits words before and after the numbers",
  fn() {
    for (const input of ["str2Num", "Str2Num"]) {
      assertEquals(toSnakeCase(input), "str_2_num");
    }
  },
});
Deno.test("toConstantCase() converts a single word", () => {
const input = "shruberry";
const expected = "SHRUBERRY";