mirror of
https://github.com/denoland/std.git
synced 2024-11-21 12:40:03 +00:00
feat(html): add escape and unescape functions for HTML entities (#3335)
This commit is contained in:
parent
5199824fca
commit
6ab64b1907
17
html/_tools/generate_data.ts
Executable file
17
html/_tools/generate_data.ts
Executable file
@ -0,0 +1,17 @@
|
||||
#!/usr/bin/env -S deno run --allow-net --allow-read --allow-write
|
||||
// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
|
||||
|
||||
// JSON version of the full canonical list of named HTML entities
|
||||
// https://html.spec.whatwg.org/multipage/named-characters.html
|
||||
import entityList from "https://html.spec.whatwg.org/entities.json" assert {
|
||||
type: "json",
|
||||
};
|
||||
|
||||
// Reduce every entry of the spec's entity table to just its `characters`
// field, keyed by the entity name exactly as it appears in the source JSON.
const entityPairs = Object.entries(entityList).map((
  [name, info],
) => [name, info.characters]);
const data = Object.fromEntries(entityPairs);

// Write the result next to the `html` module sources, pretty-printed with a
// trailing newline.
const outputUrl = new URL(import.meta.resolve("../named_entity_list.json"));
const serialized = JSON.stringify(data, null, 2) + "\n";

await Deno.writeTextFile(outputUrl, serialized);
|
104
html/entities.ts
Normal file
104
html/entities.ts
Normal file
@ -0,0 +1,104 @@
|
||||
// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
|
||||
// This module is browser compatible.
|
||||
|
||||
/** Maps an entity as written in markup (e.g. `"&amp;"`) to its decoded text. */
export type EntityList = Record<string, string>;

/** Characters that `escape` rewrites, paired with the entity emitted for each. */
const rawToEntityEntries = [
  ["&", "&amp;"],
  ["<", "&lt;"],
  [">", "&gt;"],
  ['"', "&quot;"],
  ["'", "&#39;"],
] as const;

/**
 * Entities understood by `unescape` when no list is supplied: the inverse of
 * the `escape` table plus the `&apos;` and `&nbsp;` aliases.
 */
const defaultEntityEntries: [string, string][] = [
  ...rawToEntityEntries.map(([raw, entity]): [string, string] => [entity, raw]),
  ["&apos;", "'"],
  ["&nbsp;", "\xa0"],
];

const defaultEntityList: EntityList = Object.fromEntries(defaultEntityEntries);

const rawToEntity = new Map<string, string>(rawToEntityEntries);

/** Character class matching every raw character that has to be escaped. */
const rawRe = new RegExp(`[${[...rawToEntity.keys()].join("")}]`, "g");

/**
 * Escapes text for safe interpolation into HTML text content and quoted attributes
 *
 * @example
 * ```ts
 * import { escape } from "https://deno.land/std@$STD_VERSION/html/entities.ts";
 * import { assertEquals } from "https://deno.land/std@$STD_VERSION/testing/asserts.ts";
 *
 * assertEquals(escape("<>'&AA"), "&lt;&gt;&#39;&amp;AA");
 *
 * // characters that don't need to be escaped will be left alone,
 * // even if named HTML entities exist for them
 * assertEquals(escape("þð"), "þð");
 * ```
 *
 * @param str The raw text to escape.
 * @returns `str` with `&`, `<`, `>`, `"` and `'` replaced by entities.
 */
export function escape(str: string): string {
  // `rawRe` is global, so `replace` rewrites every occurrence.
  return str.replace(rawRe, (m) => rawToEntity.get(m) ?? m);
}

export type UnescapeOptions = { entityList: EntityList };

const defaultUnescapeOptions: UnescapeOptions = {
  entityList: defaultEntityList,
};

const MAX_CODE_POINT = 0x10ffff;
const FIRST_SURROGATE = 0xd800;
const LAST_SURROGATE = 0xdfff;
const REPLACEMENT_CHARACTER = "\uFFFD";

// Regex sources for numeric character references. Each contributes exactly one
// capture group; `buildEntityRegExp` relies on their order (decimal, then hex).
const DEC_ENTITY_SOURCE = "&#([0-9]+);";
const HEX_ENTITY_SOURCE = "&#x([0-9a-fA-F]+);";

/** Compiled matcher per entity list, so each list is compiled only once. */
const entityListRegexCache = new WeakMap<EntityList, RegExp>();

/** Backslash-escapes regular-expression metacharacters in `text`. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Builds one global regex matching any named entity in `entityList` or any
 * numeric character reference.
 */
function buildEntityRegExp(entityList: EntityList): RegExp {
  const named = Object.keys(entityList)
    // An empty alternative would match at every position of the input.
    .filter((key) => key.length > 0)
    // Longest first, so "&amp;" wins over a listed truncated form like "&amp".
    .sort((a, b) => b.length - a.length)
    .map(escapeRegExp);

  // Named alternatives are escaped and therefore add no capture groups:
  // group 1 is always the decimal digits and group 2 the hex digits.
  return new RegExp(
    [...named, DEC_ENTITY_SOURCE, HEX_ENTITY_SOURCE].join("|"),
    "g",
  );
}

/**
 * Unescapes HTML entities in text
 *
 * The input is decoded in a single left-to-right pass, so text produced by
 * decoding one entity is never decoded again: `"&amp;#65;"` becomes `"&#65;"`,
 * not `"A"`, and `unescape(escape(s))` returns `s`.
 *
 * @example
 * ```ts
 * import { unescape } from "https://deno.land/std@$STD_VERSION/html/entities.ts";
 * import { assertEquals } from "https://deno.land/std@$STD_VERSION/testing/asserts.ts";
 *
 * // default options (only handles &<>'" and numeric entities)
 * assertEquals(unescape("&lt;&gt;&apos;&amp;&#65;&#x41;"), "<>'&AA");
 * assertEquals(unescape("&thorn;&eth;"), "&thorn;&eth;");
 *
 * // using the full named entity list from the HTML spec (~47K unminified)
 * import entityList from "https://deno.land/std@$STD_VERSION/html/named_entity_list.json" assert { type: "json" };
 * assertEquals(unescape("&thorn;&eth;", { entityList }), "þð");
 * ```
 *
 * @param str The text containing entities to decode.
 * @param options Optional `entityList` of named entities to recognise. The
 * compiled matcher is cached per list object, so a list should not be mutated
 * after its first use.
 * @returns `str` with listed named entities and numeric references decoded.
 */
export function unescape(
  str: string,
  options: Partial<UnescapeOptions> = {},
): string {
  // `??` rather than spreading, so an explicit `entityList: undefined` falls
  // back to the default list.
  const entityList = options.entityList ?? defaultUnescapeOptions.entityList;

  let entityRe = entityListRegexCache.get(entityList);

  if (!entityRe) {
    entityRe = buildEntityRegExp(entityList);
    entityListRegexCache.set(entityList, entityRe);
  }

  return str.replace(
    entityRe,
    (match: string, dec?: string, hex?: string): string => {
      if (dec !== undefined) return codePointStrToChar(dec, 10);
      if (hex !== undefined) return codePointStrToChar(hex, 16);
      return entityList[match] ?? match;
    },
  );
}

/**
 * Converts the digits of a numeric character reference to its character.
 *
 * @param codePointStr The digits of the reference, without `&#`, `x` or `;`.
 * @param radix The base of `codePointStr` (10 or 16).
 * @returns The character, or U+FFFD when the value is above U+10FFFF or is a
 * surrogate (which would otherwise leave a lone surrogate in the string).
 */
function codePointStrToChar(codePointStr: string, radix: number): string {
  const codePoint = parseInt(codePointStr, radix);

  const isSurrogate = codePoint >= FIRST_SURROGATE &&
    codePoint <= LAST_SURROGATE;

  return codePoint > MAX_CODE_POINT || isSurrogate
    ? REPLACEMENT_CHARACTER
    : String.fromCodePoint(codePoint);
}
|
110
html/entities_test.ts
Normal file
110
html/entities_test.ts
Normal file
@ -0,0 +1,110 @@
|
||||
// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
|
||||
|
||||
import { escape, unescape } from "./entities.ts";
|
||||
import { assertEquals } from "../testing/asserts.ts";
|
||||
import entityList from "./named_entity_list.json" assert { type: "json" };
|
||||
|
||||
// Checks that `escape` rewrites exactly the five HTML-significant characters
// and leaves everything else untouched.
Deno.test("escape", async (t) => {
  await t.step('escapes &<>"', () => {
    assertEquals(escape("&<>'\""), "&amp;&lt;&gt;&#39;&quot;");
  });
  await t.step("escapes ' to &#39; (not &apos;)", () => {
    assertEquals(escape("'"), "&#39;");
  });
  await t.step("doesn't escape non-breaking space", () => {
    assertEquals(escape("\xa0"), "\xa0");
  });
  await t.step(
    "doesn't escape other characters, even if they have named entities",
    () => {
      assertEquals(escape("þð"), "þð");
    },
  );
});
|
||||
|
||||
// Covers `unescape` for named entities (default and full lists) and for
// decimal / hexadecimal numeric character references.
Deno.test("unescape", async (t) => {
  await t.step("round-trips with escape", () => {
    const chars = "&<>'\"";
    assertEquals(unescape(escape(chars)), chars);
  });

  await t.step("named entities", async (t) => {
    await t.step("default options", async (t) => {
      await t.step("unescapes &apos; as alias for ' &#39;", () => {
        assertEquals(unescape("&apos;"), "'");
      });
      await t.step("unescapes &nbsp;", () => {
        assertEquals(unescape("&nbsp;"), "\xa0");
      });
      await t.step("doesn't unescape other named entities", () => {
        assertEquals(unescape("&thorn;&eth;"), "&thorn;&eth;");
      });
    });

    await t.step("full entity list", async (t) => {
      await t.step("unescapes arbitrary named entities", () => {
        assertEquals(unescape("&thorn;&eth;", { entityList }), "þð");
      });
      await t.step(
        "unescapes truncated named entity (no trailing semicolon) if it is listed",
        () => {
          assertEquals(unescape("&amp", { entityList }), "&");
        },
      );
      await t.step(
        "consumes full named entity even when a truncated version is specified",
        () => {
          assertEquals(unescape("&amp;", { entityList }), "&");
        },
      );
      await t.step(
        "doesn't unescape truncated named entity if it isn't listed",
        () => {
          assertEquals(
            unescape("&therefore; &therefore", { entityList }),
            "∴ &therefore",
          );
        },
      );
    });
  });

  await t.step("decimal", async (t) => {
    await t.step("unescapes decimal", () => {
      assertEquals(unescape("&#46;"), ".");
    });
    await t.step("unescapes max decimal codepoint", () => {
      assertEquals(unescape("&#1114111;"), "\u{10ffff}");
    });
    await t.step("unescapes decimal with leading zero", () => {
      assertEquals(unescape("&#046;"), ".");
    });
    await t.step(
      "unescapes invalid decimal codepoint to replacement character",
      () => {
        // One past U+10FFFF.
        assertEquals(unescape("&#1114112;"), "\uFFFD");
      },
    );
  });

  await t.step("hex", async (t) => {
    await t.step("unescapes lower-case hex", () => {
      assertEquals(unescape("&#x2e;"), ".");
    });
    await t.step("unescapes upper-case hex", () => {
      assertEquals(unescape("&#x2E;"), ".");
    });
    await t.step("unescapes hex with leading zero", () => {
      assertEquals(unescape("&#x02E;"), ".");
    });
    await t.step("unescapes max hex codepoint", () => {
      assertEquals(unescape("&#x10ffff;"), "\u{10ffff}");
    });
    await t.step(
      "unescapes invalid hex codepoint to replacement character",
      () => {
        // One past U+10FFFF.
        assertEquals(unescape("&#x110000;"), "\uFFFD");
      },
    );
  });
});
|
10
html/mod.ts
Normal file
10
html/mod.ts
Normal file
@ -0,0 +1,10 @@
|
||||
// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
|
||||
// This module is browser compatible.
|
||||
|
||||
/**
|
||||
* Functions for HTML tasks such as escaping or unescaping HTML entities
|
||||
*
|
||||
* @module
|
||||
*/
|
||||
|
||||
export * from "./entities.ts";
|
2233
html/named_entity_list.json
Normal file
2233
html/named_entity_list.json
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user