gcc/contrib/unicode/gen_libstdcxx_unicode_data.py
Jakub Jelinek d0e8f58b81 contrib, libcpp, libstdc++: Update to Unicode 16.0
It is autumn again and there is a new Unicode version 16.0.

The following patch updates our Unicode stuff in contrib, libcpp and
libstdc++ from that Unicode version.

2024-10-08  Jakub Jelinek  <jakub@redhat.com>

contrib/
	* unicode/README: Update glibc git commit hash, replace
	Unicode 15 or 15.1 versions with 16.
	* unicode/gen_libstdcxx_unicode_data.py: Use 160000 instead of
	150100 in _GLIBCXX_GET_UNICODE_DATA test.
	* unicode/from_glibc/utf8_gen.py: Updated from glibc
	064c708c78cc2a6b5802dce73108fc0c1c6bfc80 commit.
	* unicode/DerivedCoreProperties.txt: Updated from Unicode 16.0.
	* unicode/emoji-data.txt: Likewise.
	* unicode/PropList.txt: Likewise.
	* unicode/GraphemeBreakProperty.txt: Likewise.
	* unicode/DerivedNormalizationProps.txt: Likewise.
	* unicode/NameAliases.txt: Likewise.
	* unicode/UnicodeData.txt: Likewise.
	* unicode/EastAsianWidth.txt: Likewise.
gcc/testsuite/
	* c-c++-common/cpp/named-universal-char-escape-1.c: Add tests
	for some Unicode 16.0 characters, both normal and generated.
libcpp/
	* makeucnid.cc (write_copyright): Update Unicode Copyright years.
	* makeuname2c.cc (generated_ranges): Adjust Unicode version from 15.1
	to 16.0.  Add EGYPTIAN HIEROGLYPH- generated range, adjust indexes in
	following entries.
	(write_copyright): Update Unicode Copyright years.
	* generated_cpp_wcwidth.h: Regenerated.
	* ucnid.h: Regenerated.
	* uname2c.h: Regenerated.
libstdc++-v3/
	* include/bits/unicode.h (std::__unicode::__v15_1_0): Rename inline
	namespace to ...
	(std::__unicode::__v16_0_0): ... this.
	(_GLIBCXX_GET_UNICODE_DATA): Change from 150100 to 160000.
	* include/bits/unicode-data.h: Regenerated.
	* testsuite/ext/unicode/properties.cc: Check for _Gcb_SpacingMark
	on U+11F03 rather than U+1D16D as the latter lost SpacingMark property
	in Unicode 16.0.
2024-10-08 10:01:47 +02:00

256 lines
9.2 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
#
# Script to generate tables for libstdc++ std::format width estimation.
#
# This file is part of GCC.
#
# GCC is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.
#
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3. If not see
# <http://www.gnu.org/licenses/>.
# To update the Libstdc++ static data in <bits/unicode-data.h> download the latest:
# ftp://ftp.unicode.org/Public/UNIDATA/EastAsianWidth.txt
# ftp://ftp.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
# ftp://ftp.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt
# ftp://ftp.unicode.org/Public/UNIDATA/emoji/emoji-data.txt
# Then run this script and save the output to
# ../../libstdc++-v3/include/bits/unicode-data.h
import sys
import re
import math
import os
# Base name of this script; it is recorded in the first line of the output so
# the generated header says which tool produced it.
self = os.path.basename(__file__)
print("// Generated by contrib/unicode/" + self + ", do not edit.")

# Licence text and Doxygen file comment for the generated header.
print("""
// Copyright The GNU Toolchain Authors.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.
// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.
/** @file bits/unicode-data.h
* This is an internal header file, included by other library headers.
* Do not attempt to use it directly. @headername{format}
*/
""")

# Preprocessor guard: the generated header refuses to compile unless the
# includer defined _GLIBCXX_GET_UNICODE_DATA to the expected version value.
# The last element carries its own newline so a blank line follows "#endif".
print("\n".join([
    "#ifndef _GLIBCXX_GET_UNICODE_DATA",
    '# error "This is not a public header, do not include it directly"',
    "#elif _GLIBCXX_GET_UNICODE_DATA != 160000",
    '# error "Version mismatch for Unicode static data"',
    "#endif\n",
]))
def find_edges(vals, init = None):
    """Return the positions at which consecutive elements of vals change.

    The result is a list of (index, val) tuples, one for every element
    whose value differs from the element before it.  The first element is
    compared against init, so passing the table's default value as init
    suppresses a leading edge for that default.

    e.g. find_edges([a, a, b, b, c, b, b, d]) is
         [(0,a), (2,b), (4,c), (5,b), (7,d)]
    and  find_edges([a, a, b, b, c, b, b, d], a) is
         [(2,b), (4,c), (5,b), (7,d)]
    """
    boundaries = []
    last_seen = init
    for position, current in enumerate(vals):
        if current != last_seen:
            # A new run starts here; remember its value for later comparisons.
            last_seen = current
            boundaries.append((position, last_seen))
    return boundaries
# Table being filled in, indexed by code point.  The script rebinds this name
# to a fresh full-size list before each property is processed.
all_code_points = []

# Process a code point value or range of code point values with given property.
def process_code_points(code_points, val):
    """Store val in all_code_points for one code point or a range of them.

    code_points is either a single hexadecimal code point or an inclusive
    range written as "first..last".  Example arguments:
        1100..115F, x
        232A, y

    Raises ValueError if code_points is not in one of those two forms, if
    the range is reversed, or if it names a code point that is not inside
    the current table.
    """
    r = code_points.split("..")
    if len(r) == 1:
        begin = int(r[0], base=16)
        end = begin + 1
    elif len(r) == 2:
        begin = int(r[0], base=16)
        end = int(r[1], base=16) + 1
    else:
        raise ValueError(
            "malformed code point range: {!r}".format(code_points))
    # Slice assignment would silently grow the table for a range that runs
    # past its end, and silently do nothing for a reversed range, so reject
    # both instead of producing a corrupt table.
    if not 0 <= begin < end <= len(all_code_points):
        raise ValueError(
            "code point range out of bounds: {!r}".format(code_points))
    all_code_points[begin:end] = [val] * (end - begin)
# By default every code point has width 1. This is what the C++ standard says,
# even though the Unicode standard says some code points have width 0.
all_code_points = [1] * (1 + 0x10FFFF)

# Extract all code points with East_Asian_Width=W or East_Asian_Width=F
# Example lines:
# 3000 ; F
# 3001..3003 ; W
wide_or_fullwidth = re.compile(r'^[\dA-Fa-f][^;]+;\s*[WF]\s*$')
# Read the data file as UTF-8 rather than in the locale's encoding, and close
# it as soon as it has been consumed.
with open("EastAsianWidth.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Drop the trailing "# ..." comment before matching.
        line = line.split("#")[0]
        if wide_or_fullwidth.match(line):
            process_code_points(line.split(";")[0], 2)

# The C++ standard also gives width 2 to the following ranges:
# U+4DC0 U+4DFF (Yijing Hexagram Symbols)
process_code_points("4DC0..4DFF", 2)
# U+1F300 U+1F5FF (Miscellaneous Symbols and Pictographs)
process_code_points("1F300..1F5FF", 2)
# U+1F900 U+1F9FF (Supplemental Symbols and Pictographs)
process_code_points("1F900..1F9FF", 2)

# Create a list that only contains the code points that have a different width
# to the previous code point.
edges = find_edges(all_code_points, 1)

# Table for std::__unicode::__format_width(char32_t)
# NOTE(review): the emitted comment names gen_std_format_width.py, which is
# not this script's file name; left unchanged so the output stays identical.
print(" // Table generated by contrib/unicode/gen_std_format_width.py,")
print(" // from EastAsianWidth.txt from the Unicode standard.")
print(" inline constexpr char32_t __width_edges[] = {", end="")
for i, (c, _) in enumerate(edges):
    # Eight entries per output line.
    print(" " if i % 8 else "\n ", end="")
    print("{:#x},".format(c), end="")
print("\n };\n")
# By default every code point has Grapheme_Cluster_Break=Other.
all_code_points = ["Other"] * (1 + 0x10FFFF)

# Extract Grapheme_Cluster_Break property for all code points.
# Example (code_points, property) pairs extracted from the file:
# "0600..0605", "Prepend"
# "00AD", "Control"
gcb_entry = re.compile(r'^[\dA-Fa-f][^;]+;')
# Read the data file as UTF-8 rather than in the locale's encoding, and close
# it as soon as it has been consumed.
with open("GraphemeBreakProperty.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Drop the trailing "# ..." comment before matching.
        line = line.split("#")[0]
        if gcb_entry.match(line):
            code_points, grapheme_property = line.split(";")
            process_code_points(code_points, grapheme_property.strip())

edges = find_edges(all_code_points)

# Number the properties in order of first appearance, with Other fixed at 0.
gcb_props = {"Other":0}
for c, p in edges:
    if p not in gcb_props:
        gcb_props[p] = len(gcb_props)
# Number of low bits needed to hold any property value.
shift_bits = int(math.ceil(math.log2(len(gcb_props))))

# Enum definition for std::__unicode::_Gcb_property
print(" enum class _Gcb_property {")
for prop_name, prop_value in gcb_props.items():
    print(" _Gcb_{} = {},".format(prop_name, prop_value))
print(" };\n")

# Tables for std::__unicode::_Grapheme_cluster_state
# NOTE(review): the emitted comment names gen_std_format_width.py, which is
# not this script's file name; left unchanged so the output stays identical.
print(" // Values generated by contrib/unicode/gen_std_format_width.py,")
print(" // from GraphemeBreakProperty.txt from the Unicode standard.")
print(" // Entries are (code_point << shift_bits) + property.")
print(" inline constexpr int __gcb_shift_bits = {:#x};".format(shift_bits))
print(" inline constexpr uint32_t __gcb_edges[] = {", end="")
for i, (c, p) in enumerate(edges):
    # Six entries per output line.
    print(" " if i % 6 else "\n ", end="")
    x = (c << shift_bits) + gcb_props[p]
    print("{0:#x},".format(x), end="")
print("\n };\n")
# By default every code point has Indic_Conjunct_Break=None.
all_code_points = [None] * (1 + 0x10FFFF)

# Extract Indic_Conjunct_Break property for all code points.
# Example lines:
# 094D ; InCB; Linker
# 0B71 ; InCB; Consonant
# 0300..034E ; InCB; Extend
incb_entry = re.compile(r'^[\dA-Fa-f][^;]+; InCB;')
# Read the data file as UTF-8 rather than in the locale's encoding, and close
# it as soon as it has been consumed.
with open("DerivedCoreProperties.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Drop the trailing "# ..." comment before matching.
        line = line.split("#")[0]
        if incb_entry.match(line):
            code_points, _, incb_property = line.split(";")
            process_code_points(code_points, incb_property.strip())

# Table for std::__unicode::__is_incb_linker
# This table is tiny, so just contains the list of code points.
print(" inline constexpr char32_t __incb_linkers[] = {\n ", end="")
for i, p in enumerate(all_code_points):
    if p == "Linker":
        print(" 0x{:04x},".format(i), end="")
        # Linkers are listed explicitly above, so clear them here to keep
        # them out of the edge table built below.
        all_code_points[i] = None
print("\n };\n")

edges = find_edges(all_code_points)

incb_props = {None:0, "Consonant":1, "Extend":2}
print(" enum class _InCB { _Consonant = 1, _Extend = 2 };\n")

# Table for std::__unicode::__incb_property
# NOTE(review): the emitted comment names gen_std_format_width.py, which is
# not this script's file name; left unchanged so the output stays identical.
print(" // Values generated by contrib/unicode/gen_std_format_width.py,")
print(" // from DerivedCoreProperties.txt from the Unicode standard.")
print(" // Entries are (code_point << 2) + property.")
print(" inline constexpr uint32_t __incb_edges[] = {", end="")
for i, (c, p) in enumerate(edges):
    # Six entries per output line.
    print(" " if i % 6 else "\n ", end="")
    x = (c << 2) + incb_props[p]
    print("{0:#x},".format(x), end="")
print("\n };\n")
# By default every code point has Emoji=No.
all_code_points = [False] * (1 + 0x10FFFF)

# Extract Emoji=Extended_Pictographic for all code points.
# Example lines:
# 1100..115F ; Extended_Pictographic
# 232A ; Extended_Pictographic
xpicto_entry = re.compile(r'^[\dA-Fa-f][^;]+; Extended_Pictographic')
# Read the data file as UTF-8 rather than in the locale's encoding, and close
# it as soon as it has been consumed.
with open("emoji-data.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Drop the trailing "# ..." comment before matching.
        line = line.split("#")[0]
        if xpicto_entry.match(line):
            process_code_points(line.split(";")[0], True)

edges = find_edges(all_code_points, False)

# Table for std::__unicode::__is_extended_pictographic
# NOTE(review): the emitted comment names gen_std_format_width.py, which is
# not this script's file name; left unchanged so the output stays identical.
print(" // Table generated by contrib/unicode/gen_std_format_width.py,")
print(" // from emoji-data.txt from the Unicode standard.")
print(" inline constexpr char32_t __xpicto_edges[] = {", end="")
for i, (c, _) in enumerate(edges):
    # Eight entries per output line.
    print(" " if i % 8 else "\n ", end="")
    print("{:#x},".format(c), end="")
print("\n };\n")

# <bits/unicode.h> gives an error if this macro is left defined.
# Do this last, so that the generated output is not usable unless we reach here.
print("#undef _GLIBCXX_GET_UNICODE_DATA")