mirror of
https://github.com/denoland/std.git
synced 2024-11-21 20:50:22 +00:00
feat(html): add escape and unescape functions for HTML entities (#3335)
This commit is contained in:
parent
5199824fca
commit
6ab64b1907
17
html/_tools/generate_data.ts
Executable file
17
html/_tools/generate_data.ts
Executable file
@ -0,0 +1,17 @@
|
|||||||
|
#!/usr/bin/env -S deno run --allow-net --allow-read --allow-write
|
||||||
|
// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
|
||||||
|
|
||||||
|
// JSON version of the full canonical list of named HTML entities
|
||||||
|
// https://html.spec.whatwg.org/multipage/named-characters.html
|
||||||
|
import entityList from "https://html.spec.whatwg.org/entities.json" assert {
|
||||||
|
type: "json",
|
||||||
|
};
|
||||||
|
|
||||||
|
// Reduce every entry of the spec's entity table to just its replacement
// text, keyed by the entity name exactly as it appears in entities.json.
const data: Record<string, string> = {};
for (const [name, { characters }] of Object.entries(entityList)) {
  data[name] = characters;
}

// Write the result one directory above this script, pretty-printed with
// two-space indentation and a trailing newline.
const outputUrl = new URL(import.meta.resolve("../named_entity_list.json"));
const serialized = `${JSON.stringify(data, null, 2)}\n`;
await Deno.writeTextFile(outputUrl, serialized);
|
104
html/entities.ts
Normal file
104
html/entities.ts
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
|
||||||
|
// This module is browser compatible.
|
||||||
|
|
||||||
|
/** Mapping from an entity string (such as `&amp;`) to the text it decodes to. */
export type EntityList = Record<string, string>;

// Characters rewritten by `escape`, each paired with the entity emitted for
// it. The apostrophe is written as `&#39;` rather than `&apos;`.
const rawToEntityEntries = [
  ["&", "&amp;"],
  ["<", "&lt;"],
  [">", "&gt;"],
  ['"', "&quot;"],
  ["'", "&#39;"],
] as const;

// Entities recognized by `unescape` when no entity list is supplied: the
// inverse of the table above, plus `&apos;` and `&nbsp;`.
const defaultEntityList: EntityList = Object.fromEntries([
  ...rawToEntityEntries.map(([raw, entity]) => [entity, raw]),
  ["&apos;", "'"],
  ["&nbsp;", "\xa0"],
]);

// Lookup from a raw character to its entity.
const rawToEntity = new Map<string, string>(rawToEntityEntries);

// Character class matching any single character that has an entry in
// `rawToEntity`.
const rawRe = new RegExp(`[${[...rawToEntity.keys()].join("")}]`, "g");
|
||||||
|
|
||||||
|
/**
 * Replaces the characters `&`, `<`, `>`, `"` and `'` with HTML entities so
 * that the result can be interpolated into HTML text content or quoted
 * attribute values.
 *
 * @example
 * ```ts
 * import { escape } from "https://deno.land/std@$STD_VERSION/html/entities.ts";
 * import { assertEquals } from "https://deno.land/std@$STD_VERSION/testing/asserts.ts";
 *
 * assertEquals(escape("<>'&AA"), "&lt;&gt;&#39;&amp;AA");
 *
 * // Only those five characters are touched; everything else passes through
 * // unchanged, even when a named HTML entity exists for it
 * assertEquals(escape("þð"), "þð");
 * ```
 */
export function escape(str: string) {
  // `rawRe` only matches characters that are keys of `rawToEntity`, so the
  // lookup always succeeds.
  const toEntity = (raw: string) => rawToEntity.get(raw)!;
  return str.replace(rawRe, toEntity);
}
|
||||||
|
|
||||||
|
/** Options accepted by `unescape`. */
export type UnescapeOptions = {
  /** Entity strings to recognize, each mapped to the text it decodes to. */
  entityList: EntityList;
};

// Values used for any option the caller leaves out.
const defaultUnescapeOptions: UnescapeOptions = {
  entityList: defaultEntityList,
};
|
||||||
|
|
||||||
|
// Highest code point `String.fromCodePoint` accepts (U+10FFFF).
const MAX_CODE_POINT = 0x10ffff;

// Decimal numeric character reference such as `&#46;`; captures the digits.
const RX_DEC_ENTITY = /&#([0-9]+);/g;
// Hexadecimal numeric character reference such as `&#x2e;`; captures the hex
// digits. HTML allows the marker to be written as either `x` or `X`.
const RX_HEX_ENTITY = /&#[xX](\p{AHex}+);/gu;

// Compiled regex for each entity list object that has been used, so the
// pattern is built once per list rather than on every call.
const entityListRegexCache = new WeakMap<EntityList, RegExp>();
|
||||||
|
|
||||||
|
/**
 * Unescapes HTML entities in text
 *
 * Named entities are looked up in `options.entityList`; decimal (`&#46;`)
 * and hexadecimal (`&#x2e;`) character references are always decoded. The
 * input is scanned exactly once, so text produced by decoding one entity is
 * never decoded a second time: `&amp;#46;` becomes `&#46;`, not `.`.
 *
 * @example
 * ```ts
 * import { unescape } from "https://deno.land/std@$STD_VERSION/html/entities.ts";
 * import { assertEquals } from "https://deno.land/std@$STD_VERSION/testing/asserts.ts";
 *
 * // default options (only handles &<>'", non-breaking space and numeric entities)
 * assertEquals(unescape("&lt;&gt;&apos;&amp;&#65;&#x41;"), "<>'&AA");
 * assertEquals(unescape("&thorn;&eth;"), "&thorn;&eth;");
 *
 * // using the full named entity list from the HTML spec (~47K unminified)
 * import entityList from "https://deno.land/std@$STD_VERSION/html/named_entity_list.json" assert { type: "json" };
 * assertEquals(unescape("&thorn;&eth;", { entityList }), "þð");
 * ```
 *
 * @param str Text that may contain HTML entities.
 * @param options Optional overrides; `entityList` replaces the default set
 * of recognized named entities.
 * @returns `str` with every recognized entity replaced by its character(s).
 */
export function unescape(
  str: string,
  options: Partial<UnescapeOptions> = {},
): string {
  // `??` (rather than object spread) keeps the default when a caller passes
  // `{ entityList: undefined }`.
  const entityList = options.entityList ?? defaultUnescapeOptions.entityList;

  let entityRe = entityListRegexCache.get(entityList);

  if (!entityRe) {
    const namedPatterns = Object.keys(entityList)
      // An empty key would turn into an alternative that matches everywhere.
      .filter((name) => name.length > 0)
      // Longest first, so `&amp;` is preferred over a listed `&amp` prefix.
      .sort((a, b) => b.length - a.length)
      // Keys are literal text, not patterns: neutralize regex syntax
      // characters (all of these are valid escapes under the `u` flag).
      .map((name) => name.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));

    // One combined pattern: named entities (non-capturing), then the decimal
    // reference (capture group 1), then the hex reference (capture group 2).
    // The `u` flag is required by the `\p{...}` class in the hex pattern.
    entityRe = new RegExp(
      [
        ...(namedPatterns.length > 0 ? [`(?:${namedPatterns.join("|")})`] : []),
        RX_DEC_ENTITY.source,
        RX_HEX_ENTITY.source,
      ].join("|"),
      "gu",
    );

    entityListRegexCache.set(entityList, entityRe);
  }

  // Single pass over the input; the replacement text is never re-scanned.
  return str.replace(entityRe, (match: string, dec?: string, hex?: string) => {
    if (dec !== undefined) return codePointStrToChar(dec, 10);
    if (hex !== undefined) return codePointStrToChar(hex, 16);
    return entityList[match] ?? match;
  });
}
|
||||||
|
|
||||||
|
/**
 * Converts the digits of a numeric character reference into the character
 * they denote.
 *
 * @param codePointStr The digits captured from the reference, without the
 * surrounding `&#` / `&#x` and `;`.
 * @param radix Base the digits are written in (10 or 16 for the callers in
 * this module).
 * @returns The corresponding character, or U+FFFD REPLACEMENT CHARACTER when
 * the value does not fall within `0..MAX_CODE_POINT`.
 */
function codePointStrToChar(codePointStr: string, radix: number) {
  const codePoint = parseInt(codePointStr, radix);

  // `String.fromCodePoint` throws a RangeError for anything outside
  // 0..0x10FFFF (including NaN), so only call it for values in range.
  const inRange = codePoint >= 0 && codePoint <= MAX_CODE_POINT;

  // The replacement character is spelled as an escape so it survives any
  // re-encoding of this source file.
  return inRange ? String.fromCodePoint(codePoint) : "\uFFFD";
}
|
110
html/entities_test.ts
Normal file
110
html/entities_test.ts
Normal file
@ -0,0 +1,110 @@
|
|||||||
|
// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
|
||||||
|
|
||||||
|
import { escape, unescape } from "./entities.ts";
|
||||||
|
import { assertEquals } from "../testing/asserts.ts";
|
||||||
|
import entityList from "./named_entity_list.json" assert { type: "json" };
|
||||||
|
|
||||||
|
// `escape` must rewrite exactly the five characters & < > ' " and leave
// everything else untouched.
Deno.test("escape", async (t) => {
  await t.step('escapes &<>"', () => {
    assertEquals(escape("&<>'\""), "&amp;&lt;&gt;&#39;&quot;");
  });
  await t.step("escapes ' to &#39; (not &apos;)", () => {
    assertEquals(escape("'"), "&#39;");
  });
  await t.step("doesn't escape non-breaking space", () => {
    assertEquals(escape("\xa0"), "\xa0");
  });
  await t.step(
    "doesn't escape other characters, even if they have named entities",
    () => {
      assertEquals(escape("þð"), "þð");
    },
  );
});
|
||||||
|
|
||||||
|
// `unescape` is exercised with the default entity list, with the full named
// entity list from the HTML spec, and with decimal and hexadecimal numeric
// character references.
Deno.test("unescape", async (t) => {
  await t.step("round-trips with escape", () => {
    const chars = "&<>'\"";
    assertEquals(unescape(escape(chars)), chars);
  });

  await t.step("named entities", async (t) => {
    await t.step("default options", async (t) => {
      await t.step("unescapes &apos; as alias for ' &#39;", () => {
        assertEquals(unescape("&apos;"), "'");
      });
      await t.step("unescapes &nbsp;", () => {
        assertEquals(unescape("&nbsp;"), "\xa0");
      });
      await t.step("doesn't unescape other named entities", () => {
        assertEquals(unescape("&thorn;&eth;"), "&thorn;&eth;");
      });
    });

    await t.step("full entity list", async (t) => {
      await t.step("unescapes arbitrary named entities", () => {
        assertEquals(unescape("&thorn;&eth;", { entityList }), "þð");
      });
      await t.step(
        "unescapes truncated named entity (no trailing semicolon) if it is listed",
        () => {
          assertEquals(unescape("&amp", { entityList }), "&");
        },
      );
      await t.step(
        "consumes full named entity even when a truncated version is specified",
        () => {
          assertEquals(unescape("&amp;", { entityList }), "&");
        },
      );
      await t.step(
        "doesn't unescape truncated named entity if it isn't listed",
        () => {
          assertEquals(
            unescape("&therefore; &therefore", { entityList }),
            "∴ &therefore",
          );
        },
      );
    });
  });

  await t.step("decimal", async (t) => {
    await t.step("unescapes decimal", () => {
      assertEquals(unescape("&#46;"), ".");
    });
    await t.step("unescapes max decimal codepoint", () => {
      assertEquals(unescape("&#1114111;"), "\u{10ffff}");
    });
    await t.step("unescapes decimal with leading zero", () => {
      assertEquals(unescape("&#046;"), ".");
    });
    await t.step(
      "unescapes invalid decimal codepoint to replacement character",
      () => {
        // 1114112 is one past the last valid code point (0x10FFFF).
        assertEquals(unescape("&#1114112;"), "\uFFFD");
      },
    );
  });

  await t.step("hex", async (t) => {
    await t.step("unescapes lower-case hex", () => {
      assertEquals(unescape("&#x2e;"), ".");
    });
    await t.step("unescapes upper-case hex", () => {
      assertEquals(unescape("&#x2E;"), ".");
    });
    await t.step("unescapes hex with leading zero", () => {
      assertEquals(unescape("&#x02E;"), ".");
    });
    await t.step("unescapes max hex codepoint", () => {
      assertEquals(unescape("&#x10ffff;"), "\u{10ffff}");
    });
    await t.step(
      "unescapes invalid hex codepoint to replacement character",
      () => {
        // 0x110000 is one past the last valid code point (0x10FFFF).
        assertEquals(unescape("&#x110000;"), "\uFFFD");
      },
    );
  });
});
|
10
html/mod.ts
Normal file
10
html/mod.ts
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
|
||||||
|
// This module is browser compatible.
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Functions for HTML tasks such as escaping or unescaping HTML entities
|
||||||
|
*
|
||||||
|
* @module
|
||||||
|
*/
|
||||||
|
|
||||||
|
export * from "./entities.ts";
|
2233
html/named_entity_list.json
Normal file
2233
html/named_entity_list.json
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user