mirror of
https://github.com/denoland/std.git
synced 2024-11-21 20:50:22 +00:00
fix(text): unicode support and word splitting according to case (#5447)
Co-authored-by: Yoshiya Hinosawa <stibium121@gmail.com> Co-authored-by: Asher Gomez <ashersaupingomez@gmail.com>
This commit is contained in:
parent
e1935ecc82
commit
97c5596f0b
@ -1,9 +1,18 @@
|
||||
// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
|
||||
|
||||
// A capitalized word: one uppercase letter followed by lowercase letters, e.g. "Apple".
const CAPITALIZED_WORD_REGEXP = /\p{Lu}\p{Ll}+/u;
// A run of uppercase letters, e.g. "ID" or "URL". The lookahead ends the run
// before a following capitalized word, so "HTMLElement" yields "HTML" and
// leaves "Element" for the next match.
const ACRONYM_REGEXP = /\p{Lu}+(?=(\p{Lu}\p{Ll})|\P{L}|\b)/u;
// A run of lowercase letters, e.g. "apple".
const LOWERCASED_WORD_REGEXP = /(\p{Ll}+)/u;
// Any run of letters, including scripts that have no upper/lower case.
const ANY_LETTERS = /\p{L}+/u;
// A run of numeric characters, e.g. "123".
const DIGITS_REGEXP = /\p{N}+/u;

// Alternation order matters: the more specific patterns are tried first and
// ANY_LETTERS only picks up what the case-based patterns cannot match.
const WORD_OR_NUMBER_REGEXP = new RegExp(
  `${CAPITALIZED_WORD_REGEXP.source}|${ACRONYM_REGEXP.source}|${LOWERCASED_WORD_REGEXP.source}|${ANY_LETTERS.source}|${DIGITS_REGEXP.source}`,
  "gu",
);

/**
 * Splits a string into its words and numbers.
 *
 * Every maximal match of {@linkcode WORD_OR_NUMBER_REGEXP} becomes one entry:
 * capitalized words, acronyms, lowercase words, runs of caseless letters and
 * runs of digits. Characters that are neither letters nor numbers (whitespace,
 * punctuation, emoji, ...) only act as separators and never appear in the
 * output.
 *
 * The input is matched as-is. It must not be reduced to ASCII first, otherwise
 * every non-Latin word would be dropped before the Unicode-aware pattern runs.
 *
 * @param input The string to split.
 * @returns The words and numbers in order of appearance, or an empty array
 * when the input contains none.
 */
export function splitToWords(input: string): string[] {
  return input.match(WORD_OR_NUMBER_REGEXP) ?? [];
}
|
||||
|
||||
export function capitalizeWord(word: string): string {
|
||||
|
@ -3,6 +3,33 @@
|
||||
import { assertEquals } from "@std/assert";
|
||||
import { splitToWords } from "./_util.ts";
|
||||
|
||||
Deno.test({
  name: "split() returns an empty array for an empty string",
  fn: () => {
    // An empty input contains no words at all.
    assertEquals(splitToWords("").length, 0);
  },
});
|
||||
|
||||
Deno.test({
  name:
    "split() returns an empty array when input has no alphanumeric characters",
  fn: () => {
    // Emoji and spaces only: nothing here is a letter or a number.
    const words = splitToWords("🦕♥️ 🦕♥️ 🦕♥️");
    assertEquals(words.length, 0);
  },
});
|
||||
|
||||
Deno.test({
  name: "split() ignores non-alphanumeric characters mixed with words",
  fn: () => {
    // The emoji act purely as separators between the two words.
    assertEquals(splitToWords("🦕deno♥️wuv"), ["deno", "wuv"]);
  },
});
|
||||
|
||||
Deno.test({
|
||||
name: "split() handles whitespace",
|
||||
fn() {
|
||||
@ -12,6 +39,15 @@ Deno.test({
|
||||
},
|
||||
});
|
||||
|
||||
Deno.test({
  name: "split() handles whitespace at string end and start",
  fn: () => {
    // Leading and trailing blanks must not produce empty entries.
    const words = splitToWords(" deno Is AWESOME ");
    assertEquals(words, ["deno", "Is", "AWESOME"]);
  },
});
|
||||
|
||||
Deno.test({
|
||||
name: "split() handles mixed delimiters",
|
||||
fn() {
|
||||
@ -21,6 +57,15 @@ Deno.test({
|
||||
},
|
||||
});
|
||||
|
||||
Deno.test({
  name: "split() handles a delimiter sequence",
  fn: () => {
    // " -> " and the trailing "!" are consecutive non-word characters.
    const words = splitToWords("I am -> thirsty!");
    assertEquals(words, ["I", "am", "thirsty"]);
  },
});
|
||||
|
||||
Deno.test({
|
||||
name: "split() handles upper case delimiter",
|
||||
fn() {
|
||||
@ -39,6 +84,42 @@ Deno.test({
|
||||
},
|
||||
});
|
||||
|
||||
Deno.test({
  name: "split() handles casing",
  fn: () => {
    // With no delimiters present, word boundaries come from case changes.
    assertEquals(splitToWords("denoIsAwesome"), ["deno", "Is", "Awesome"]);
  },
});
|
||||
|
||||
Deno.test({
  name: "split() handles unicode",
  fn: () => {
    // A Cyrillic word followed by camel-cased Latin words.
    const words = splitToWords("шруберри IsAwesome");
    assertEquals(words, ["шруберри", "Is", "Awesome"]);
  },
});
|
||||
|
||||
Deno.test({
  name: "split() handles unicode casing",
  fn: () => {
    // The only boundary is the lowercase-to-uppercase Cyrillic transition.
    const words = splitToWords("шруберриШруберри");
    assertEquals(words, ["шруберри", "Шруберри"]);
  },
});
|
||||
|
||||
Deno.test({
  name: "split() handles languages without casing",
  fn: () => {
    // Hebrew words separated by an underscore and a space.
    const words = splitToWords("אין_על דינו");
    assertEquals(words, ["אין", "על", "דינו"]);
  },
});
|
||||
|
||||
Deno.test({
|
||||
name: "split() handles screaming snake case",
|
||||
fn() {
|
||||
@ -48,6 +129,15 @@ Deno.test({
|
||||
},
|
||||
});
|
||||
|
||||
Deno.test({
  name: "split() handles acronym followed by a capitalized word",
  fn: () => {
    // "HTML" has to be separated from the capitalized words that follow it.
    const words = splitToWords("I Love HTMLDivElement");
    assertEquals(words, ["I", "Love", "HTML", "Div", "Element"]);
  },
});
|
||||
|
||||
Deno.test({
|
||||
name: "split() handles underscore delimiter",
|
||||
fn() {
|
||||
@ -56,3 +146,12 @@ Deno.test({
|
||||
assertEquals(result, expected);
|
||||
},
|
||||
});
|
||||
|
||||
// The acronym-before-capitalized-word case is already covered by an earlier
// test in this file that used this exact name, input and expectation. This
// test covers an acronym that is terminated by digits instead.
Deno.test({
  name: "split() handles acronym followed by digits",
  fn() {
    const result = splitToWords("HTML5Parser");
    const expected = ["HTML", "5", "Parser"];
    assertEquals(result, expected);
  },
});
|
||||
|
@ -93,6 +93,12 @@ Deno.test("toPascalCase() trims whitespace", () => {
|
||||
assertEquals(result, expected);
|
||||
});
|
||||
|
||||
Deno.test("toPascalCase() converts a single word with Cyrillic letters", () => {
  // A lowercase Cyrillic word should come back with its first letter capitalized.
  assertEquals(toPascalCase("шруберри"), "Шруберри");
});
|
||||
|
||||
Deno.test("toSnakeCase() handles an empty string", () => {
  // Empty in, empty out.
  const input = "";
  const expected = "";
  assertEquals(toSnakeCase(input), expected);
});
|
||||
@ -121,6 +127,11 @@ Deno.test("toSnakeCase() trims whitespace", () => {
|
||||
assertEquals(result, expected);
|
||||
});
|
||||
|
||||
Deno.test("toSnakeCase() splits words before and after the numbers", () => {
  // Both spellings differ only in the case of the first letter and must
  // produce the same snake-cased result.
  for (const input of ["str2Num", "Str2Num"]) {
    assertEquals(toSnakeCase(input), "str_2_num");
  }
});
|
||||
|
||||
Deno.test("toConstantCase() converts a single word", () => {
|
||||
const input = "shruberry";
|
||||
const expected = "SHRUBERRY";
|
||||
|
Loading…
Reference in New Issue
Block a user