mirror of
https://github.com/gcc-mirror/gcc.git
synced 2024-11-21 13:40:47 +00:00
d0e8f58b81
It is autumn again and there is a new Unicode version 16.0. The following patch updates our Unicode stuff in contrib, libcpp and libstdc++ from that Unicode version. 2024-10-08 Jakub Jelinek <jakub@redhat.com> contrib/ * unicode/README: Update glibc git commit hash, replace Unicode 15 or 15.1 versions with 16. * unicode/gen_libstdcxx_unicode_data.py: Use 160000 instead of 150100 in _GLIBCXX_GET_UNICODE_DATA test. * unicode/from_glibc/utf8_gen.py: Updated from glibc 064c708c78cc2a6b5802dce73108fc0c1c6bfc80 commit. * unicode/DerivedCoreProperties.txt: Updated from Unicode 16.0. * unicode/emoji-data.txt: Likewise. * unicode/PropList.txt: Likewise. * unicode/GraphemeBreakProperty.txt: Likewise. * unicode/DerivedNormalizationProps.txt: Likewise. * unicode/NameAliases.txt: Likewise. * unicode/UnicodeData.txt: Likewise. * unicode/EastAsianWidth.txt: Likewise. gcc/testsuite/ * c-c++-common/cpp/named-universal-char-escape-1.c: Add tests for some Unicode 16.0 characters, both normal and generated. libcpp/ * makeucnid.cc (write_copyright): Update Unicode Copyright years. * makeuname2c.cc (generated_ranges): Adjust Unicode version from 15.1 to 16.0. Add EGYPTIAN HIEROGLYPH- generated range, adjust indexes in following entries. (write_copyright): Update Unicode Copyright years. * generated_cpp_wcwidth.h: Regenerated. * ucnid.h: Regenerated. * uname2c.h: Regenerated. libstdc++-v3/ * include/bits/unicode.h (std::__unicode::__v15_1_0): Rename inline namespace to ... (std::__unicode::__v16_0_0): ... this. (_GLIBCXX_GET_UNICODE_DATA): Change from 150100 to 160000. * include/bits/unicode-data.h: Regenerated. * testsuite/ext/unicode/properties.cc: Check for _Gcb_SpacingMark on U+11F03 rather than U+1D16D as the latter lost SpacingMark property in Unicode 16.0.
256 lines
9.2 KiB
Python
Executable File
256 lines
9.2 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
#
|
||
# Script to generate tables for libstdc++ std::format width estimation.
|
||
#
|
||
# This file is part of GCC.
|
||
#
|
||
# GCC is free software; you can redistribute it and/or modify it under
|
||
# the terms of the GNU General Public License as published by the Free
|
||
# Software Foundation; either version 3, or (at your option) any later
|
||
# version.
|
||
#
|
||
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||
# for more details.
|
||
#
|
||
# You should have received a copy of the GNU General Public License
|
||
# along with GCC; see the file COPYING3. If not see
|
||
# <http://www.gnu.org/licenses/>.
|
||
|
||
# To update the Libstdc++ static data in <bits/unicode-data.h> download the latest:
|
||
# ftp://ftp.unicode.org/Public/UNIDATA/EastAsianWidth.txt
|
||
# ftp://ftp.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
|
||
# ftp://ftp.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt
|
||
# ftp://ftp.unicode.org/Public/UNIDATA/emoji/emoji-data.txt
|
||
# Then run this script and save the output to
|
||
# ../../libstdc++-v3/include/bits/unicode-data.h
|
||
|
||
import sys
|
||
import re
|
||
import math
|
||
import os
|
||
|
||
# Name of this script, used in the "Generated by" banner of the output.
self = os.path.basename(__file__)

# Everything below is written to stdout; the caller redirects it into
# the generated header.
banner = "// Generated by contrib/unicode/{}, do not edit.".format(self)
print(banner)

# Licence text and Doxygen file comment for the generated header.
license_and_file_doc = """
// Copyright The GNU Toolchain Authors.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.

/** @file bits/unicode-data.h
* This is an internal header file, included by other library headers.
* Do not attempt to use it directly. @headername{format}
*/
"""
print(license_and_file_doc)

# Preprocessor guard: the generated header refuses to compile unless the
# includer defined _GLIBCXX_GET_UNICODE_DATA to the expected version value.
for guard_line in (
    "#ifndef _GLIBCXX_GET_UNICODE_DATA",
    '# error "This is not a public header, do not include it directly"',
    "#elif _GLIBCXX_GET_UNICODE_DATA != 160000",
    '# error "Version mismatch for Unicode static data"',
    "#endif\n",
):
    print(guard_line)
|
||
|
||
# Process a list and return a list of tuples (index, val) which are the elements
# in the list that have a different val from the previous element.
# e.g. find_edges([a, a, b, b, c, b, b, d]) is [(0,a), (2,b), (4,c), (5,b), (7,d)]
# and find_edges([a, a, b, b, c, b, b, d], a) is [(2,b), (4,c), (5,b), (7,d)]
def find_edges(vals, init = None):
    """Return the (index, value) pairs at which `vals` changes value.

    `init` is the value assumed to precede the first element, so a
    leading run equal to `init` produces no entry.
    """
    transitions = []
    current = init
    for position, value in enumerate(vals):
        if value != current:
            # A new run starts here; remember it and record where it began.
            current = value
            transitions.append((position, current))
    return transitions
|
||
|
||
# Table indexed by code point; re-created with a fresh default value before
# each property is processed, and filled in by process_code_points().
all_code_points = []

# Process a code point value or range of code point values with given property.
def process_code_points(code_points, val):
    """Store `val` in all_code_points for one code point or a range.

    code_points -- hexadecimal text, either a single value or an
                   inclusive "first..last" range.  Example arguments:
                       1100..115F, x
                       232A, y
    val         -- the property value to record.

    The module-level all_code_points list is modified in place.

    Raises ValueError if the text is not hexadecimal, contains more than
    one ".." separator, or describes a range whose end precedes its start.
    """
    r = code_points.split("..")
    if len(r) == 1:
        c = int(r[0], base=16)
        all_code_points[c] = val
    elif len(r) == 2:
        begin = int(r[0], base=16)
        end = int(r[1], base=16) + 1
        if end <= begin:
            # A reversed range would otherwise be dropped without a trace.
            raise ValueError(
                "reversed code point range: {!r}".format(code_points))
        all_code_points[begin:end] = [val] * (end - begin)
    else:
        raise ValueError(
            "malformed code point range: {!r}".format(code_points))
|
||
|
||
# By default every code point has width 1. This is what the C++ standard says,
# even though the Unicode standard says some code points have width 0.
all_code_points = [1] * (1 + 0x10FFFF)

# Extract all code points with East_Asian_Width=W or East_Asian_Width=F
# The data file is read as UTF-8 regardless of the locale, and is closed
# as soon as it has been processed.
with open("EastAsianWidth.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # 3000 ; F
        # 3001..3003 ; W
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+;\s*[WF]\s*$', line):
            process_code_points(line.split(";")[0], 2)

# The C++ standard also gives width 2 to the following ranges:
# U+4DC0 – U+4DFF (Yijing Hexagram Symbols)
process_code_points("4DC0..4DFF", 2)
# U+1F300 – U+1F5FF (Miscellaneous Symbols and Pictographs)
process_code_points("1F300..1F5FF", 2)
# U+1F900 – U+1F9FF (Supplemental Symbols and Pictographs)
process_code_points("1F900..1F9FF", 2)

# Create a list that only contains the code points that have a different width
# to the previous code point.
edges = find_edges(all_code_points, 1)

# Table for std::__unicode::__format_width(char32_t)

# Name the generator via `self` (the running script's basename) so the
# comment in the generated header always matches the script that wrote it.
print(" // Table generated by contrib/unicode/{},".format(self))
print(" // from EastAsianWidth.txt from the Unicode standard.")
print(" inline constexpr char32_t __width_edges[] = {", end="")
for i, e in enumerate(edges):
    # Eight entries per output line.
    if i % 8:
        print(" ", end="")
    else:
        print("\n ", end="")
    c,_ = e
    print("{:#x},".format(c), end="")
print("\n };\n")
|
||
|
||
# By default every code point has Grapheme_Cluster_Break=Other.
all_code_points = ["Other"] * (1 + 0x10FFFF)

# Extract Grapheme_Cluster_Break property for all code points.
# The data file is read as UTF-8 regardless of the locale, and is closed
# as soon as it has been processed.
with open("GraphemeBreakProperty.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # "0600..0605", "Prepend"
        # "00AD", "Control"
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+;', line):
            code_points, grapheme_property = line.split(";")
            process_code_points(code_points, grapheme_property.strip())

edges = find_edges(all_code_points)

# Number the property values in order of first appearance; "Other" is
# always 0.
gcb_props = {"Other":0}
for c, p in edges:
    if p not in gcb_props:
        gcb_props[p] = len(gcb_props)
# Number of low bits needed to store any property number.
shift_bits = int(math.ceil(math.log2(len(gcb_props))))

# Enum definition for std::__unicode::_Gcb_property

print(" enum class _Gcb_property {")
for prop_name, prop_value in gcb_props.items():
    print(" _Gcb_{} = {},".format(prop_name, prop_value))
print(" };\n")

# Tables for std::__unicode::_Grapheme_cluster_state

# Name the generator via `self` (the running script's basename) so the
# comment in the generated header always matches the script that wrote it.
print(" // Values generated by contrib/unicode/{},".format(self))
print(" // from GraphemeBreakProperty.txt from the Unicode standard.")
print(" // Entries are (code_point << shift_bits) + property.")
print(" inline constexpr int __gcb_shift_bits = {:#x};".format(shift_bits))
print(" inline constexpr uint32_t __gcb_edges[] = {", end="")
for i, e in enumerate(edges):
    # Six entries per output line.
    if i % 6:
        print(" ", end="")
    else:
        print("\n ", end="")
    c, p = e
    # Pack the code point and its property number into one integer.
    x = (c << shift_bits) + gcb_props[p]
    print("{0:#x},".format(x), end="")
print("\n };\n")
|
||
|
||
# By default every code point has Indic_Conjunct_Break=None.
all_code_points = [None] * (1 + 0x10FFFF)

# Extract Indic_Conjunct_Break property for all code points.
# The data file is read as UTF-8 regardless of the locale, and is closed
# as soon as it has been processed.
with open("DerivedCoreProperties.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # 094D ; InCB; Linker
        # 0B71 ; InCB; Consonant
        # 0300..034E ; InCB; Extend
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+; InCB;', line):
            code_points, _, incb_property = line.split(";")
            process_code_points(code_points, incb_property.strip())

# Table for std::__unicode::__is_incb_linker
# This table is tiny, so just contains the list of code points.
print(" inline constexpr char32_t __incb_linkers[] = {\n ", end="")
linkers = [cp for cp, prop in enumerate(all_code_points) if prop == "Linker"]
for cp in linkers:
    print(" 0x{:04x},".format(cp), end="")
    # Linkers are listed in their own table above, so clear them here to
    # keep them out of the edge table generated below.
    all_code_points[cp] = None
print("\n };\n")

edges = find_edges(all_code_points)

# Numeric codes stored in the low two bits of each __incb_edges entry.
incb_props = {None:0, "Consonant":1, "Extend":2}
print(" enum class _InCB { _Consonant = 1, _Extend = 2 };\n")
# Table for std::__unicode::__incb_property
# Name the generator via `self` (the running script's basename) so the
# comment in the generated header always matches the script that wrote it.
print(" // Values generated by contrib/unicode/{},".format(self))
print(" // from DerivedCoreProperties.txt from the Unicode standard.")
print(" // Entries are (code_point << 2) + property.")
print(" inline constexpr uint32_t __incb_edges[] = {", end="")
for i, e in enumerate(edges):
    # Six entries per output line.
    if i % 6:
        print(" ", end="")
    else:
        print("\n ", end="")
    c, p = e
    x = (c << 2) + incb_props[p]
    print("{0:#x},".format(x), end="")
print("\n };\n")
|
||
|
||
# By default every code point has Emoji=No.
all_code_points = [False] * (1 + 0x10FFFF)

# Extract Emoji=Extended_Pictographic for all code points.
# The data file is read as UTF-8 regardless of the locale, and is closed
# as soon as it has been processed.
with open("emoji-data.txt", "r", encoding="utf-8") as data_file:
    for line in data_file:
        # Example lines:
        # 1100..115F ; Extended_Pictographic
        # 232A ; Extended_Pictographic
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+; Extended_Pictographic', line):
            process_code_points(line.split(";")[0], True)

edges = find_edges(all_code_points, False)

# Table for std::__unicode::__is_extended_pictographic
# Name the generator via `self` (the running script's basename) so the
# comment in the generated header always matches the script that wrote it.
print(" // Table generated by contrib/unicode/{},".format(self))
print(" // from emoji-data.txt from the Unicode standard.")
print(" inline constexpr char32_t __xpicto_edges[] = {", end="")
for i, e in enumerate(edges):
    # Eight entries per output line.
    if i % 8:
        print(" ", end="")
    else:
        print("\n ", end="")
    c,_ = e
    print("{:#x},".format(c), end="")
print("\n };\n")

# <bits/unicode.h> gives an error if this macro is left defined.
# Do this last, so that the generated output is not usable unless we reach here.
print("#undef _GLIBCXX_GET_UNICODE_DATA")
|