Byte vs column awareness for diagnostic-show-locus.c (PR 49973)
contrib/ChangeLog

2019-12-09  Lewis Hyatt  <lhyatt@gmail.com>

        PR preprocessor/49973
        * unicode/from_glibc/unicode_utils.py: Support script from
        glibc (commit 464cd3) to extract character widths from Unicode data
        files.
        * unicode/from_glibc/utf8_gen.py: Likewise.
        * unicode/UnicodeData.txt: Unicode v. 12.1.0 data file.
        * unicode/EastAsianWidth.txt: Likewise.
        * unicode/PropList.txt: Likewise.
        * unicode/gen_wcwidth.py: New utility to generate
        libcpp/generated_cpp_wcwidth.h with help from the glibc support
        scripts and the Unicode data files.
        * unicode/unicode-license.txt: Added.
        * unicode/README: New explanatory file.

libcpp/ChangeLog

2019-12-09  Lewis Hyatt  <lhyatt@gmail.com>

        PR preprocessor/49973
        * generated_cpp_wcwidth.h: New file generated by
        ../contrib/unicode/gen_wcwidth.py, supports new cpp_wcwidth function.
        * charset.c (compute_next_display_width): New function to help
        implement display columns.
        (cpp_byte_column_to_display_column): Likewise.
        (cpp_display_column_to_byte_column): Likewise.
        (cpp_wcwidth): Likewise.
        * include/cpplib.h (cpp_byte_column_to_display_column): Declare.
        (cpp_display_column_to_byte_column): Declare.
        (cpp_wcwidth): Declare.
        (cpp_display_width): New function.

gcc/ChangeLog

2019-12-09  Lewis Hyatt  <lhyatt@gmail.com>

        PR preprocessor/49973
        * input.c (location_compute_display_column): New function to help
        with multibyte awareness in diagnostics.
        (test_cpp_utf8): New self-test.
        (input_c_tests): Call the new test.
        * input.h (location_compute_display_column): Declare.
        * diagnostic-show-locus.c: Pervasive changes to add multibyte
        awareness to all classes and functions.
        (enum column_unit): New enum.
        (class exploc_with_display_col): New class.
        (class layout_point): Convert m_column member to array m_columns[2].
        (layout_range::contains_point): Add col_unit argument.
        (test_layout_range_for_single_point): Pass new argument.
        (test_layout_range_for_single_line): Likewise.
        (test_layout_range_for_multiple_lines): Likewise.
        (line_bounds::convert_to_display_cols): New function.
        (layout::get_state_at_point): Add col_unit argument.
        (make_range): Use empty filename rather than dummy filename.
        (get_line_width_without_trailing_whitespace): Rename to...
        (get_line_bytes_without_trailing_whitespace): ...this.
        (test_get_line_width_without_trailing_whitespace): Rename to...
        (test_get_line_bytes_without_trailing_whitespace): ...this.
        (class layout): m_exploc changed to exploc_with_display_col from
        plain expanded_location.
        (layout::get_linenum_width): New accessor member function.
        (layout::get_x_offset_display): Likewise.
        (layout::calculate_linenum_width): New subroutine for the
        constructor.
        (layout::calculate_x_offset_display): Likewise.
        (layout::layout): Use the new subroutines.  Add multibyte awareness.
        (layout::print_source_line): Add multibyte awareness.
        (layout::print_line): Likewise.
        (layout::print_annotation_line): Likewise.
        (line_label::line_label): Likewise.
        (layout::print_any_labels): Likewise.
        (layout::annotation_line_showed_range_p): Likewise.
        (get_printed_columns): Likewise.
        (class line_label): Rename m_length to m_display_width.
        (get_affected_columns): Rename to...
        (get_affected_range): ...this; add col_unit argument and multibyte
        awareness.
        (class correction): Add m_affected_bytes and m_display_cols
        members.  Rename m_len to m_byte_length for clarity.  Add multibyte
        awareness throughout.
        (correction::insertion_p): Add multibyte awareness.
        (correction::compute_display_cols): New function.
        (correction::ensure_terminated): Use new member name m_byte_length.
        (line_corrections::add_hint): Add multibyte awareness.
        (layout::print_trailing_fixits): Likewise.
        (layout::get_x_bound_for_row): Likewise.
        (test_one_liner_simple_caret_utf8): New self-test analogous to the
        one with _utf8 suffix removed, testing multibyte awareness.
        (test_one_liner_caret_and_range_utf8): Likewise.
        (test_one_liner_multiple_carets_and_ranges_utf8): Likewise.
        (test_one_liner_fixit_insert_before_utf8): Likewise.
        (test_one_liner_fixit_insert_after_utf8): Likewise.
        (test_one_liner_fixit_remove_utf8): Likewise.
        (test_one_liner_fixit_replace_utf8): Likewise.
        (test_one_liner_fixit_replace_non_equal_range_utf8): Likewise.
        (test_one_liner_fixit_replace_equal_secondary_range_utf8): Likewise.
        (test_one_liner_fixit_validation_adhoc_locations_utf8): Likewise.
        (test_one_liner_many_fixits_1_utf8): Likewise.
        (test_one_liner_many_fixits_2_utf8): Likewise.
        (test_one_liner_labels_utf8): Likewise.
        (test_diagnostic_show_locus_one_liner_utf8): Likewise.
        (test_overlapped_fixit_printing_utf8): Likewise.
        (test_overlapped_fixit_printing): Adapt for changes to
        get_affected_columns, get_printed_columns and class corrections.
        (test_overlapped_fixit_printing_2): Likewise.
        (test_linenum_sep): New constant.
        (test_left_margin): Likewise.
        (test_offset_impl): Helper function for new test.
        (test_layout_x_offset_display_utf8): New test.
        (diagnostic_show_locus_c_tests): Call new tests.

gcc/testsuite/ChangeLog

2019-12-09  Lewis Hyatt  <lhyatt@gmail.com>

        PR preprocessor/49973
        * gcc.dg/plugin/diagnostic_plugin_test_show_locus.c
        (test_show_locus): Tweak so that expected output is the same as
        before the diagnostic-show-locus.c changes.
        * gcc.dg/cpp/pr66415-1.c: Likewise.

From-SVN: r279137
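For illustration, the conversion the new charset.c helpers perform can be
sketched in Python (a rough model only, with unicodedata standing in for the
generated cpp_wcwidth tables; the real implementation is the C code in
libcpp/charset.c): walk the line one UTF-8 character at a time, give invalid
or control bytes a width of 1, and sum per-character widths.

    import unicodedata

    def utf8_len(lead):
        # Length of a UTF-8 sequence, judged from its lead byte.
        if lead < 0x80: return 1
        if 0xC2 <= lead <= 0xDF: return 2
        if 0xE0 <= lead <= 0xEF: return 3
        if 0xF0 <= lead <= 0xF4: return 4
        return 1  # invalid lead byte: treated as one width-1 byte

    def char_width(ch):
        # Rough stand-in for cpp_wcwidth: combining marks are width 0,
        # wide/fullwidth East Asian characters width 2, the rest width 1.
        if unicodedata.combining(ch):
            return 0
        return 2 if unicodedata.east_asian_width(ch) in ('W', 'F') else 1

    def byte_column_to_display_column(line, byte_col):
        # line is a bytes object; byte_col counts bytes from the line start.
        display, pos = 0, 0
        end = min(byte_col, len(line))
        while pos < end:
            n = utf8_len(line[pos])
            try:
                display += char_width(line[pos:pos + n].decode('utf-8'))
            except UnicodeDecodeError:
                n = 1           # broken sequence: count one byte, width 1
                display += 1
            pos += n
        return display

This reproduces the expectations in the new test_cpp_utf8 self-test below,
e.g. a display width of 18 for the 24-byte mixed string used there.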
commit ee9256409f (parent 763c9f4a85)
contrib/ChangeLog
@@ -1,3 +1,19 @@
2019-12-09  Lewis Hyatt  <lhyatt@gmail.com>

        PR preprocessor/49973
        * unicode/from_glibc/unicode_utils.py: Support script from
        glibc (commit 464cd3) to extract character widths from Unicode data
        files.
        * unicode/from_glibc/utf8_gen.py: Likewise.
        * unicode/UnicodeData.txt: Unicode v. 12.1.0 data file.
        * unicode/EastAsianWidth.txt: Likewise.
        * unicode/PropList.txt: Likewise.
        * unicode/gen_wcwidth.py: New utility to generate
        libcpp/generated_cpp_wcwidth.h with help from the glibc support
        scripts and the Unicode data files.
        * unicode/unicode-license.txt: Added.
        * unicode/README: New explanatory file.

2019-12-07  Richard Sandiford  <richard.sandiford@arm.com>

        * texi2pod.pl: Handle @headitems in @multitables, printing them
2473  contrib/unicode/EastAsianWidth.txt  Normal file
File diff suppressed because it is too large
1666  contrib/unicode/PropList.txt  Normal file
File diff suppressed because it is too large
44  contrib/unicode/README  Normal file
@@ -0,0 +1,44 @@
This directory contains a mechanism for GCC to have its own internal
implementation of wcwidth functionality (cpp_wcwidth () in libcpp/charset.c).

The idea is to produce the necessary lookup table
(../../libcpp/generated_cpp_wcwidth.h) in a reproducible way, starting from the
following files that are distributed by the Unicode Consortium:

ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
ftp://ftp.unicode.org/Public/UNIDATA/EastAsianWidth.txt
ftp://ftp.unicode.org/Public/UNIDATA/PropList.txt

These three files have been added to source control in this directory;
please see unicode-license.txt for the relevant copyright information.

In order to keep in sync with glibc's wcwidth as much as possible, it is
desirable for the logic that processes the Unicode data to be the same as
glibc's.  To that end, we also put in this directory, in the from_glibc/
directory, the glibc python code that implements their logic.  This code was
copied verbatim from glibc, and it can be updated at any time from the glibc
source code repository.  The files copied from that repository are:

localedata/unicode-gen/unicode_utils.py
localedata/unicode-gen/utf8_gen.py

And the most recent versions added to GCC are from glibc git commit:
2a764c6ee848dfe92cb2921ed3b14085f15d9e79

Finally, the script gen_wcwidth.py found here contains the GCC-specific code
to map glibc's output to the lookup tables we require.  This script should not
need to change, unless there are structural changes to the Unicode data files
or to the glibc code.

The procedure to update GCC's wcwidth tables is the following:

1.  Update the three Unicode data files from the above URLs.

2.  Update the two glibc files in from_glibc/ from glibc's git.  Update
    the commit number above in this README.

3.  Run ./gen_wcwidth.py X.Y > ../../libcpp/generated_cpp_wcwidth.h
    (where X.Y is the version of the Unicode standard corresponding to the
    Unicode data files being used, most recently, 12.1).

After that, GCC's wcwidth will match the most recent glibc.
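For illustration only, the generated header's two arrays describe a sorted
list of code-point ranges: wcwidth_range_ends[i] holds the last code point of
range i, and wcwidth_widths[i] the width shared by that whole range.  A Python
sketch of the intended lookup (the real lookup is the C function
cpp_wcwidth () in libcpp/charset.c; this snippet is not part of the build):

    import bisect

    def wcwidth(code_point, range_ends, widths):
        # Find the first range whose end is >= code_point.
        i = bisect.bisect_left(range_ends, code_point)
        if i < len(range_ends):
            return widths[i]
        return 1  # past the last table entry; the default width is 1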
32841  contrib/unicode/UnicodeData.txt  Normal file
File diff suppressed because it is too large
527  contrib/unicode/from_glibc/unicode_utils.py  Normal file
@@ -0,0 +1,527 @@
# Utilities to generate Unicode data for glibc from upstream Unicode data.
#
# Copyright (C) 2014-2019 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''
This module contains utilities used by the scripts to generate
Unicode data for glibc from upstream Unicode data files.
'''

import sys
import re


# Common locale header.
COMMENT_HEADER = """
% This file is part of the GNU C Library and contains locale data.
% The Free Software Foundation does not claim any copyright interest
% in the locale data contained in this file.  The foregoing does not
% affect the license of the GNU C Library as a whole.  It does not
% exempt you from the conditions of the license if your use would
% otherwise be governed by that license.
"""

# Dictionary holding the entire contents of the UnicodeData.txt file
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#  …
# }
UNICODE_ATTRIBUTES = {}

# Dictionary holding the entire contents of the DerivedCoreProperties.txt file
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  …
# }
DERIVED_CORE_PROPERTIES = {}

# Dictionary holding the entire contents of the EastAsianWidths.txt file
#
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}

def fill_attribute(code_point, fields):
    '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.

    One entry in the UNICODE_ATTRIBUTES dictionary represents one line
    in the UnicodeData.txt file.

    '''
    UNICODE_ATTRIBUTES[code_point] = {
        'name': fields[1],          # Character name
        'category': fields[2],      # General category
        'combining': fields[3],     # Canonical combining classes
        'bidi': fields[4],          # Bidirectional category
        'decomposition': fields[5], # Character decomposition mapping
        'decdigit': fields[6],      # Decimal digit value
        'digit': fields[7],         # Digit value
        'numeric': fields[8],       # Numeric value
        'mirrored': fields[9],      # mirrored
        'oldname': fields[10],      # Old Unicode 1.0 name
        'comment': fields[11],      # comment
        # Uppercase mapping
        'upper': int(fields[12], 16) if fields[12] else None,
        # Lowercase mapping
        'lower': int(fields[13], 16) if fields[13] else None,
        # Titlecase mapping
        'title': int(fields[14], 16) if fields[14] else None,
    }

def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
    '''
    with open(filename, mode='r') as unicode_data_file:
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                    'f': filename, 'l': line})
                exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                fields_start = fields
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                fields[1] = fields[1].split(',')[0][1:]
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                        'f': filename, 'l': line})
                    exit(1)
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []

def fill_derived_core_properties(filename):
    '''Stores the entire contents of the DerivedCoreProperties.txt file
    in the DERIVED_CORE_PROPERTIES dictionary.

    Lines in DerivedCoreProperties.txt are either a code point range like
    this:

    0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z

    or a single code point like this:

    00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR

    '''
    with open(filename, mode='r') as derived_core_properties_file:
        for line in derived_core_properties_file:
            match = re.match(
                r'^(?P<codepoint1>[0-9A-F]{4,6})'
                + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
                + r'\s*;\s*(?P<property>[a-zA-Z_]+)',
                line)
            if not match:
                continue
            start = match.group('codepoint1')
            end = match.group('codepoint2')
            if not end:
                end = start
            for code_point in range(int(start, 16), int(end, 16)+1):
                prop = match.group('property')
                if code_point in DERIVED_CORE_PROPERTIES:
                    DERIVED_CORE_PROPERTIES[code_point].append(prop)
                else:
                    DERIVED_CORE_PROPERTIES[code_point] = [prop]

def fill_east_asian_widths(filename):
    '''Stores the entire contents of the EastAsianWidths.txt file
    in the EAST_ASIAN_WIDTHS dictionary.

    Lines in EastAsianWidths.txt are either a code point range like
    this:

    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>

    or a single code point like this:

    A015;W           # Lm         YI SYLLABLE WU
    '''
    with open(filename, mode='r') as east_asian_widths_file:
        for line in east_asian_widths_file:
            match = re.match(
                r'^(?P<codepoint1>[0-9A-F]{4,6})'
                +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
                +r'\s*;\s*(?P<property>[a-zA-Z]+)',
                line)
            if not match:
                continue
            start = match.group('codepoint1')
            end = match.group('codepoint2')
            if not end:
                end = start
            for code_point in range(int(start, 16), int(end, 16)+1):
                EAST_ASIAN_WIDTHS[code_point] = match.group('property')

def to_upper(code_point):
    '''Returns the code point of the uppercase version
    of the given code point'''
    if (UNICODE_ATTRIBUTES[code_point]['name']
        and UNICODE_ATTRIBUTES[code_point]['upper']):
        return UNICODE_ATTRIBUTES[code_point]['upper']
    else:
        return code_point

def to_lower(code_point):
    '''Returns the code point of the lowercase version
    of the given code point'''
    if (UNICODE_ATTRIBUTES[code_point]['name']
        and UNICODE_ATTRIBUTES[code_point]['lower']):
        return UNICODE_ATTRIBUTES[code_point]['lower']
    else:
        return code_point

def to_upper_turkish(code_point):
    '''Returns the code point of the Turkish uppercase version
    of the given code point'''
    if code_point == 0x0069:
        return 0x0130
    return to_upper(code_point)

def to_lower_turkish(code_point):
    '''Returns the code point of the Turkish lowercase version
    of the given code point'''
    if code_point == 0x0049:
        return 0x0131
    return to_lower(code_point)

def to_title(code_point):
    '''Returns the code point of the titlecase version
    of the given code point'''
    if (UNICODE_ATTRIBUTES[code_point]['name']
        and UNICODE_ATTRIBUTES[code_point]['title']):
        return UNICODE_ATTRIBUTES[code_point]['title']
    else:
        return code_point

def is_upper(code_point):
    '''Checks whether the character with this code point is uppercase'''
    return (to_lower(code_point) != code_point
            or (code_point in DERIVED_CORE_PROPERTIES
                and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point]))

def is_lower(code_point):
    '''Checks whether the character with this code point is lowercase'''
    # Some characters are defined as “Lowercase” in
    # DerivedCoreProperties.txt but do not have a mapping to upper
    # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
    # one of these.
    return (to_upper(code_point) != code_point
            # <U00DF> is lowercase, but without simple to_upper mapping.
            or code_point == 0x00DF
            or (code_point in DERIVED_CORE_PROPERTIES
                and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point]))

def is_alpha(code_point):
    '''Checks whether the character with this code point is alphabetic'''
    return ((code_point in DERIVED_CORE_PROPERTIES
             and
             'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point])
            or
            # Consider all the non-ASCII digits as alphabetic.
            # ISO C 99 forbids us to have them in category “digit”,
            # but we want iswalnum to return true on them.
            (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
             and not (code_point >= 0x0030 and code_point <= 0x0039)))

def is_digit(code_point):
    '''Checks whether the character with this code point is a digit'''
    if False:
        return (UNICODE_ATTRIBUTES[code_point]['name']
                and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd')
        # Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
        # a zero.  Must add <0> in front of them by hand.
    else:
        # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
        # takes it away:
        # 7.25.2.1.5:
        #    The iswdigit function tests for any wide character that
        #    corresponds to a decimal-digit character (as defined in 5.2.1).
        # 5.2.1:
        #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
        return (code_point >= 0x0030 and code_point <= 0x0039)

def is_outdigit(code_point):
    '''Checks whether the character with this code point is outdigit'''
    return (code_point >= 0x0030 and code_point <= 0x0039)

def is_blank(code_point):
    '''Checks whether the character with this code point is blank'''
    return (code_point == 0x0009 # '\t'
            # Category Zs without mention of '<noBreak>'
            or (UNICODE_ATTRIBUTES[code_point]['name']
                and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs'
                and '<noBreak>' not in
                UNICODE_ATTRIBUTES[code_point]['decomposition']))

def is_space(code_point):
    '''Checks whether the character with this code point is a space'''
    # Don’t make U+00A0 a space. Non-breaking space means that all programs
    # should treat it like a punctuation character, not like a space.
    return (code_point == 0x0020 # ' '
            or code_point == 0x000C # '\f'
            or code_point == 0x000A # '\n'
            or code_point == 0x000D # '\r'
            or code_point == 0x0009 # '\t'
            or code_point == 0x000B # '\v'
            # Categories Zl, Zp, and Zs without mention of "<noBreak>"
            or (UNICODE_ATTRIBUTES[code_point]['name']
                and
                (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']
                 or
                 (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs']
                  and
                  '<noBreak>' not in
                  UNICODE_ATTRIBUTES[code_point]['decomposition']))))

def is_cntrl(code_point):
    '''Checks whether the character with this code point is
    a control character'''
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>'
                 or
                 UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']))

def is_xdigit(code_point):
    '''Checks whether the character with this code point is
    a hexadecimal digit'''
    if False:
        return (is_digit(code_point)
                or (code_point >= 0x0041 and code_point <= 0x0046)
                or (code_point >= 0x0061 and code_point <= 0x0066))
    else:
        # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
        # takes it away:
        # 7.25.2.1.12:
        #    The iswxdigit function tests for any wide character that
        #    corresponds to a hexadecimal-digit character (as defined
        #    in 6.4.4.1).
        # 6.4.4.1:
        #    hexadecimal-digit: one of
        #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
        return ((code_point >= 0x0030 and code_point <= 0x0039)
                or (code_point >= 0x0041 and code_point <= 0x0046)
                or (code_point >= 0x0061 and code_point <= 0x0066))

def is_graph(code_point):
    '''Checks whether the character with this code point is
    a graphical character'''
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
            and not is_space(code_point))

def is_print(code_point):
    '''Checks whether the character with this code point is printable'''
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
            and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp'])

def is_punct(code_point):
    '''Checks whether the character with this code point is punctuation'''
    if False:
        return (UNICODE_ATTRIBUTES[code_point]['name']
                and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P'))
    else:
        # The traditional POSIX definition of punctuation is every graphic,
        # non-alphanumeric character.
        return (is_graph(code_point)
                and not is_alpha(code_point)
                and not is_digit(code_point))

def is_combining(code_point):
    '''Checks whether the character with this code point is
    a combining character'''
    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
    # file. In 3.0.1 it was identical to the union of the general categories
    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
    # PropList.txt file, so we take the latter definition.
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and
            UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me'])

def is_combining_level3(code_point):
    '''Checks whether the character with this code point is
    a combining level3 character'''
    return (is_combining(code_point)
            and
            int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200))

def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.'''
    if code_point < 0x10000:
        return '<U{:04X}>'.format(code_point)
    else:
        return '<U{:08X}>'.format(code_point)

def ucs_symbol_range(code_point_low, code_point_high):
    '''Returns a string UCS symbol string for a code point range.

    Example:

    <U0041>..<U005A>
    '''
    return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high)

def verifications():
    '''Tests whether the is_* functions observe the known restrictions'''
    for code_point in sorted(UNICODE_ATTRIBUTES):
        # toupper restriction: "Only characters specified for the keywords
        # lower and upper shall be specified."
        if (to_upper(code_point) != code_point
            and not (is_lower(code_point) or is_upper(code_point))):
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
                     'sym': ucs_symbol(code_point),
                     'c': code_point,
                     'uc': to_upper(code_point)})
        # tolower restriction: "Only characters specified for the keywords
        # lower and upper shall be specified."
        if (to_lower(code_point) != code_point
            and not (is_lower(code_point) or is_upper(code_point))):
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
                     'sym': ucs_symbol(code_point),
                     'c': code_point,
                     'uc': to_lower(code_point)})
        # alpha restriction: "Characters classified as either upper or lower
        # shall automatically belong to this class."
        if ((is_lower(code_point) or is_upper(code_point))
            and not is_alpha(code_point)):
            sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
                'sym': ucs_symbol(code_point)})
        # alpha restriction: “No character specified for the keywords cntrl,
        # digit, punct or space shall be specified.”
        if (is_alpha(code_point) and is_cntrl(code_point)):
            sys.stderr.write('%(sym)s is alpha and cntrl\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_alpha(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is alpha and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_alpha(code_point) and is_punct(code_point)):
            sys.stderr.write('%(sym)s is alpha and punct\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_alpha(code_point) and is_space(code_point)):
            sys.stderr.write('%(sym)s is alpha and space\n' %{
                'sym': ucs_symbol(code_point)})
        # space restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, graph or xdigit shall be specified.”
        # upper, lower, alpha already checked above.
        if (is_space(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is space and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_space(code_point) and is_graph(code_point)):
            sys.stderr.write('%(sym)s is space and graph\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_space(code_point) and is_xdigit(code_point)):
            sys.stderr.write('%(sym)s is space and xdigit\n' %{
                'sym': ucs_symbol(code_point)})
        # cntrl restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, punct, graph, print or xdigit shall be
        # specified.”  upper, lower, alpha already checked above.
        if (is_cntrl(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is cntrl and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_punct(code_point)):
            sys.stderr.write('%(sym)s is cntrl and punct\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_graph(code_point)):
            sys.stderr.write('%(sym)s is cntrl and graph\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_print(code_point)):
            sys.stderr.write('%(sym)s is cntrl and print\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_xdigit(code_point)):
            sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{
                'sym': ucs_symbol(code_point)})
        # punct restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
        # be specified.”  upper, lower, alpha, cntrl already checked above.
        if (is_punct(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is punct and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_punct(code_point) and is_xdigit(code_point)):
            sys.stderr.write('%(sym)s is punct and xdigit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_punct(code_point) and code_point == 0x0020):
            sys.stderr.write('%(sym)s is punct\n' %{
                'sym': ucs_symbol(code_point)})
        # graph restriction: “No character specified for the keyword cntrl
        # shall be specified.”  Already checked above.

        # print restriction: “No character specified for the keyword cntrl
        # shall be specified.”  Already checked above.

        # graph - print relation: differ only in the <space> character.
        # How is this possible if there is more than one space character?!
        # I think susv2/xbd/locale.html should speak of “space characters”,
        # not “space character”.
        if (is_print(code_point)
            and not (is_graph(code_point) or is_space(code_point))):
            sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
                'sym': ucs_symbol(code_point)})
        if (not is_print(code_point)
            and (is_graph(code_point) or code_point == 0x0020)):
            sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
                'sym': ucs_symbol(code_point)})
364  contrib/unicode/from_glibc/utf8_gen.py  Executable file
@@ -0,0 +1,364 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2014-2019 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''glibc/localedata/charmaps/UTF-8 file generator script

This script generates a glibc/localedata/charmaps/UTF-8 file
from Unicode data.

Usage: python3 utf8_gen.py [-u UnicodeData.txt] [-e EastAsianWidth.txt]
       [-p PropList.txt] --unicode_version X.Y.Z

It will output the UTF-8 file.
'''

import argparse
import sys
import re
import unicode_utils

# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.

JAMO_INITIAL_SHORT_NAME = (
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
)

JAMO_MEDIAL_SHORT_NAME = (
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
)

JAMO_FINAL_SHORT_NAME = (
    '', 'G', 'GG', 'GS', 'N', 'NI', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
)

def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file

    '''
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        #  2000-09-24  Bruno Haible  <haible@clisp.cons.org>
        #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use>
        #  ranges, so they become printable and carry a width. Comment out
        #  surrogate ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here:
        for i in range(int(start, 16), int(end, 16)+1 ):
            index2, index3 = divmod(i - 0xaC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                                   + JAMO_INITIAL_SHORT_NAME[index1] \
                                   + JAMO_MEDIAL_SHORT_NAME[index2] \
                                   + JAMO_FINAL_SHORT_NAME[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        return
    # The UnicodeData.txt file contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5>     /xe4/xb6/x80 <CJK Ideograph Extension A>
    for i in range(int(start, 16), int(end, 16), 64 ):
        if i > (int(end, 16)-64):
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i),
                unicode_utils.ucs_symbol(int(end,16)),
                convert_to_hex(i),
                name))
            break
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(i),
            unicode_utils.ucs_symbol(i+63),
            convert_to_hex(i),
            name))

def process_charmap(flines, outfile):
    '''This function takes an array which contains *all* lines
    of UnicodeData.txt and writes lines to outfile as used in the

    CHARMAP
    …
    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010>     /x10 DATA LINK ESCAPE
    <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800>     /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F>     /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD>     /xf4/x8f/xbf/x80 <Plane 16 Private Use>

    '''
    fields_start = []
    for line in flines:
        fields = line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # “Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        if fields[1].endswith(', First>') and not 'Surrogate,' in fields[1]:
            fields_start = fields
            continue
        if fields[1].endswith(', Last>') and not 'Surrogate,' in fields[1]:
            process_range(fields_start[0], fields[0],
                          outfile, fields[1][:-7]+'>')
            fields_start = []
            continue
        fields_start = []
        if 'Surrogate,' in fields[1]:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(int(fields[0], 16)),
            convert_to_hex(int(fields[0], 16)),
            fields[1]))

def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.'''
    # Getting UTF8 of Unicode characters.
    # In Python3, .encode('UTF-8') does not work for
    # surrogates. Therefore, we use this conversion table
    surrogates = {
        0xD800: '/xed/xa0/x80',
        0xDB7F: '/xed/xad/xbf',
        0xDB80: '/xed/xae/x80',
        0xDBFF: '/xed/xaf/xbf',
        0xDC00: '/xed/xb0/x80',
        0xDFFF: '/xed/xbf/xbf',
    }
    if code_point in surrogates:
        return surrogates[code_point]
    return ''.join([
        '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
    ])

def write_header_charmap(outfile):
    '''Write the header on top of the CHARMAP section to the output file'''
    outfile.write("<code_set_name> UTF-8\n")
    outfile.write("<comment_char> %\n")
    outfile.write("<escape_char> /\n")
    outfile.write("<mb_cur_min> 1\n")
    outfile.write("<mb_cur_max> 6\n\n")
    outfile.write("% CHARMAP generated using utf8_gen.py\n")
    outfile.write("% alias ISO-10646/UTF-8\n")
    outfile.write("CHARMAP\n")

def write_header_width(outfile, unicode_version):
    '''Writes the header on top of the WIDTH section to the output file'''
    outfile.write('% Character width according to Unicode '
                  + '{:s}.\n'.format(unicode_version))
    outfile.write('% - Default width is 1.\n')
    outfile.write('% - Double-width characters have width 2; generated from\n')
    outfile.write('%   "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
    outfile.write('% - Non-spacing characters have width 0; '
                  + 'generated from PropList.txt or\n')
    outfile.write('%   "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
                  + 'UnicodeData.txt"\n')
    outfile.write('% - Format control characters have width 0; '
                  + 'generated from\n')
    outfile.write("%   \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n")
    # Not needed, covered by Cf:
    # outfile.write("% - Zero width characters have width 0; generated from\n")
    # outfile.write("%   \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
    outfile.write("WIDTH\n")

def process_width(outfile, ulines, elines, plines):
    '''ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt containing characters with width “W” or “F”,
    plines are lines from PropList.txt which contain characters
    with the property “Prepended_Concatenation_Mark”.

    '''
    width_dict = {}
    for line in elines:
        fields = line.split(";")
        if not '..' in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            width_dict[key] = 2

    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
            width_dict[int(fields[0], 16)] = 0

    for line in plines:
        # Characters with the property “Prepended_Concatenation_Mark”
        # should have the width 1:
        fields = line.split(";")
        if not '..' in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            del width_dict[key] # default width is 1

    # handle special cases for compatibility
    for key in list((0x00AD,)):
        # https://www.cs.tut.fi/~jkorpela/shy.html
        if key in width_dict:
            del width_dict[key] # default width is 1
    for key in list(range(0x1160, 0x1200)):
        width_dict[key] = 0
    for key in list(range(0x3248, 0x3250)):
        # These are “A” which means we can decide whether to treat them
        # as “W” or “N” based on context:
        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
        # For us, “W” seems better.
        width_dict[key] = 2
    for key in list(range(0x4DC0, 0x4E00)):
        width_dict[key] = 2

    same_width_lists = []
    current_width_list = []
    for key in sorted(width_dict):
        if not current_width_list:
            current_width_list = [key]
        elif (key == current_width_list[-1] + 1
              and width_dict[key] == width_dict[current_width_list[0]]):
            current_width_list.append(key)
        else:
            same_width_lists.append(current_width_list)
            current_width_list = [key]
    if current_width_list:
        same_width_lists.append(current_width_list)

    for same_width_list in same_width_lists:
        if len(same_width_list) == 1:
            outfile.write('{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                width_dict[same_width_list[0]]))
        else:
            outfile.write('{:s}...{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                unicode_utils.ucs_symbol(same_width_list[-1]),
                width_dict[same_width_list[0]]))

if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a UTF-8 file from UnicodeData.txt, EastAsianWidth.txt,
        and PropList.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-e', '--east_asian_with_file',
        nargs='?',
        type=str,
        default='EastAsianWidth.txt',
        help=('The EastAsianWidth.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-p', '--prop_list_file',
        nargs='?',
        type=str,
        default='PropList.txt',
        help=('The PropList.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE:
        UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
    with open(ARGS.east_asian_with_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
        EAST_ASIAN_WIDTH_LINES = []
        for LINE in EAST_ASIAN_WIDTH_FILE:
            # If characters from EastAsianWidth.txt which are from
            # reserved ranges (i.e. not yet assigned code points)
            # are added to the WIDTH section of the UTF-8 file, then
            # “make check” produces “Unknown Character” errors for
            # these code points because such unassigned code points
            # are not in the CHARMAP section of the UTF-8 file.
            #
            # Therefore, we skip all reserved code points when reading
            # the EastAsianWidth.txt file.
            if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
                continue
            if re.match(r'^[^;]*;[WF]', LINE):
                EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
    with open(ARGS.prop_list_file, mode='r') as PROP_LIST_FILE:
        PROP_LIST_LINES = []
        for LINE in PROP_LIST_FILE:
            if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
                PROP_LIST_LINES.append(LINE.strip())
    with open('UTF-8', mode='w') as OUTFILE:
        # Process UnicodeData.txt and write the CHARMAP to the UTF-8 file.
        write_header_charmap(OUTFILE)
        process_charmap(UNICODE_DATA_LINES, OUTFILE)
        OUTFILE.write("END CHARMAP\n\n")
        # Process EastAsianWidth.txt and write the WIDTH to the UTF-8 file.
        write_header_width(OUTFILE, ARGS.unicode_version)
        process_width(OUTFILE,
                      UNICODE_DATA_LINES,
                      EAST_ASIAN_WIDTH_LINES,
                      PROP_LIST_LINES)
        OUTFILE.write("END WIDTH\n")
106  contrib/unicode/gen_wcwidth.py  Executable file
@@ -0,0 +1,106 @@
#!/usr/bin/env python3
#
# Script to generate tables for cpp_wcwidth, leveraging glibc's utf8_gen.py.
#
# This file is part of GCC.
#
# GCC is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.
#
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3.  If not see
# <http://www.gnu.org/licenses/>.

import sys
import os

if len(sys.argv) != 2:
    print("usage: %s <unicode version>" % sys.argv[0], file=sys.stderr)
    sys.exit(1)
unicode_version = sys.argv[1]

# Parse a codepoint in the format output by glibc tools.
def parse_ucn(s):
    if not (s.startswith("<U") and s.endswith(">")):
        raise ValueError
    return int(s[2:-1], base=16)

# Process a line of width output from utf8_gen.py and update global array.
widths = [1] * (1 + 0x10FFFF)
def process_width(line):
    # Example lines:
    # <UA8FF>              0
    # <UA926>...<UA92D>    0

    s = line.split()
    width = int(s[1])
    r = s[0].split("...")
    if len(r) == 1:
        begin = parse_ucn(r[0])
        end = begin + 1
    elif len(r) == 2:
        begin = parse_ucn(r[0])
        end = parse_ucn(r[1]) + 1
    else:
        raise ValueError
    widths[begin:end] = [width] * (end - begin)

# To keep things simple, we use glibc utf8_gen.py as-is.  It only outputs to a
# file named UTF-8, which is not configurable.  Then we parse that file into
# the form we want.
os.system("from_glibc/utf8_gen.py --unicode_version %s" % unicode_version)
processing = False
for line in open("UTF-8", "r"):
    if processing:
        if line == "END WIDTH\n":
            processing = False
        else:
            try:
                process_width(line)
            except (ValueError, IndexError) as e:
                print(e, "warning: ignored unexpected line: %s" % line,
                      file=sys.stderr, end="")
    elif line == "WIDTH\n":
        processing = True

# All bytes < 256 we treat as width 1.
widths[0:256] = [1] * 256

# Condense the list to contiguous ranges.  Each completed entry of all_ranges
# is a pair [last code point in the range, width shared by the range];
# cur_range[0] keeps tracking the end of the range currently being grown.
cur_range = [-1, 1]
all_ranges = []
for i, width in enumerate(widths):
    if width == cur_range[1]:
        cur_range[0] = i
    else:
        all_ranges.append(cur_range)
        cur_range = [i, width]

# Output the arrays for generated_cpp_wcwidth.h
print("/* Generated by contrib/unicode/gen_wcwidth.py,",
      "with the help of glibc's")
print("   utf8_gen.py, using version %s" % unicode_version,
      "of the Unicode standard.  */")
print("\nstatic const cppchar_t wcwidth_range_ends[] = {", end="")
for i, r in enumerate(all_ranges):
    if i % 8:
        print(" ", end="")
    else:
        print("\n  ", end="")
    print("0x%x," % (r[0]), end="")
print("\n};\n")
print("static const unsigned char wcwidth_widths[] = {", end="")
for i, r in enumerate(all_ranges):
    if i % 24:
        print(" ", end="")
    else:
        print("\n  ", end="")
    print("%d," % r[1], end="")
print("\n};")
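# For reference, the invocation described in the README in this directory,
# run from contrib/unicode/, is:
#
#   ./gen_wcwidth.py 12.1 > ../../libcpp/generated_cpp_wcwidth.h
#
# where 12.1 is the Unicode version of the data files being used.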
50  contrib/unicode/unicode-license.txt  Normal file
@@ -0,0 +1,50 @@
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE

Unicode Data Files include all data files under the directories
http://www.unicode.org/Public/, http://www.unicode.org/reports/, and
http://www.unicode.org/cldr/data/. Unicode Data Files do not include PDF
online code charts under the directory http://www.unicode.org/Public/.
Software includes any source code published in the Unicode Standard or under
the directories http://www.unicode.org/Public/,
http://www.unicode.org/reports/, and http://www.unicode.org/cldr/data/.

NOTICE TO USER: Carefully read the following legal agreement. BY
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S DATA FILES
("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), YOU UNEQUIVOCALLY ACCEPT, AND
AGREE TO BE BOUND BY, ALL OF THE TERMS AND CONDITIONS OF THIS AGREEMENT. IF
YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA
FILES OR SOFTWARE.

COPYRIGHT AND PERMISSION NOTICE

Copyright © 1991-2013 Unicode, Inc. All rights reserved. Distributed under
the Terms of Use in http://www.unicode.org/copyright.html.

Permission is hereby granted, free of charge, to any person obtaining a
copy of the Unicode data files and any associated documentation (the "Data
Files") or Unicode software and any associated documentation (the "Software")
to deal in the Data Files or Software without restriction, including without
limitation the rights to use, copy, modify, merge, publish, distribute, and/or
sell copies of the Data Files or Software, and to permit persons to whom the
Data Files or Software are furnished to do so, provided that (a) the above
copyright notice(s) and this permission notice appear with all copies of the
Data Files or Software, (b) both the above copyright notice(s) and this
permission notice appear in associated documentation, and (c) there is clear
notice in each modified Data File or in the Software as well as in the
documentation associated with the Data File(s) or Software that the data or
software has been modified.

THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD
PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN
THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE
DATA FILES OR SOFTWARE.

Except as contained in this notice, the name of a copyright holder shall
not be used in advertising or otherwise to promote the sale, use or other
dealings in these Data Files or Software without prior written authorization
of the copyright holder.
@ -1,3 +1,79 @@
|
||||
2019-12-09  Lewis Hyatt  <lhyatt@gmail.com>

PR preprocessor/49973
* input.c (location_compute_display_column): New function to help with
multibyte awareness in diagnostics.
(test_cpp_utf8): New self-test.
(input_c_tests): Call the new test.
* input.h (location_compute_display_column): Declare.
* diagnostic-show-locus.c: Pervasive changes to add multibyte awareness
to all classes and functions.
(enum column_unit): New enum.
(class exploc_with_display_col): New class.
(class layout_point): Convert m_column member to array m_columns[2].
(layout_range::contains_point): Add col_unit argument.
(test_layout_range_for_single_point): Pass new argument.
(test_layout_range_for_single_line): Likewise.
(test_layout_range_for_multiple_lines): Likewise.
(line_bounds::convert_to_display_cols): New function.
(layout::get_state_at_point): Add col_unit argument.
(make_range): Use empty filename rather than dummy filename.
(get_line_width_without_trailing_whitespace): Rename to...
(get_line_bytes_without_trailing_whitespace): ...this.
(test_get_line_width_without_trailing_whitespace): Rename to...
(test_get_line_bytes_without_trailing_whitespace): ...this.
(class layout): m_exploc changed to exploc_with_display_col from
plain expanded_location.
(layout::get_linenum_width): New accessor member function.
(layout::get_x_offset_display): Likewise.
(layout::calculate_linenum_width): New subroutine for the constructor.
(layout::calculate_x_offset_display): Likewise.
(layout::layout): Use the new subroutines.  Add multibyte awareness.
(layout::print_source_line): Add multibyte awareness.
(layout::print_line): Likewise.
(layout::print_annotation_line): Likewise.
(line_label::line_label): Likewise.
(layout::print_any_labels): Likewise.
(layout::annotation_line_showed_range_p): Likewise.
(get_printed_columns): Likewise.
(class line_label): Rename m_length to m_display_width.
(get_affected_columns): Rename to...
(get_affected_range): ...this; add col_unit argument and multibyte
awareness.
(class correction): Add m_affected_bytes and m_display_cols
members.  Rename m_len to m_byte_length for clarity.  Add multibyte
awareness throughout.
(correction::insertion_p): Add multibyte awareness.
(correction::compute_display_cols): New function.
(correction::ensure_terminated): Use new member name m_byte_length.
(line_corrections::add_hint): Add multibyte awareness.
(layout::print_trailing_fixits): Likewise.
(layout::get_x_bound_for_row): Likewise.
(test_one_liner_simple_caret_utf8): New self-test analogous to the one
with _utf8 suffix removed, testing multibyte awareness.
(test_one_liner_caret_and_range_utf8): Likewise.
(test_one_liner_multiple_carets_and_ranges_utf8): Likewise.
(test_one_liner_fixit_insert_before_utf8): Likewise.
(test_one_liner_fixit_insert_after_utf8): Likewise.
(test_one_liner_fixit_remove_utf8): Likewise.
(test_one_liner_fixit_replace_utf8): Likewise.
(test_one_liner_fixit_replace_non_equal_range_utf8): Likewise.
(test_one_liner_fixit_replace_equal_secondary_range_utf8): Likewise.
(test_one_liner_fixit_validation_adhoc_locations_utf8): Likewise.
(test_one_liner_many_fixits_1_utf8): Likewise.
(test_one_liner_many_fixits_2_utf8): Likewise.
(test_one_liner_labels_utf8): Likewise.
(test_diagnostic_show_locus_one_liner_utf8): Likewise.
(test_overlapped_fixit_printing_utf8): Likewise.
(test_overlapped_fixit_printing): Adapt for changes to
get_affected_columns, get_printed_columns and class corrections.
(test_overlapped_fixit_printing_2): Likewise.
(test_linenum_sep): New constant.
(test_left_margin): Likewise.
(test_offset_impl): Helper function for new test.
(test_layout_x_offset_display_utf8): New test.
(diagnostic_show_locus_c_tests): Call new tests.

2019-12-09  Eric Botcazou  <ebotcazou@adacore.com>

* tree.c (build_array_type_1): Add SET_CANONICAL parameter and compute
File diff suppressed because it is too large.
gcc/input.c | 105
@@ -908,6 +908,22 @@ make_location (location_t caret, source_range src_range)
   return COMBINE_LOCATION_DATA (line_table, pure_loc, src_range, NULL);
 }

/* An expanded_location stores the column in byte units.  This function
   converts that column to display units.  That requires reading the associated
   source line in order to calculate the display width.  If that cannot be done
   for any reason, then returns the byte column as a fallback.  */
int
location_compute_display_column (expanded_location exploc)
{
  if (!(exploc.file && *exploc.file && exploc.line && exploc.column))
    return exploc.column;
  char_span line = location_get_source_line (exploc.file, exploc.line);
  /* If line is NULL, this function returns exploc.column which is the
     desired fallback.  */
  return cpp_byte_column_to_display_column (line.get_buffer (), line.length (),
                                            exploc.column);
}

/* Dump statistics to stderr about the memory usage of the line_table
   set of line maps.  This also displays some statistics about macro
   expansion.  */
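As a quick illustration of the byte-vs-display distinction this function exists to bridge, here is a minimal standalone C sketch. It is not GCC code: toy_wcwidth and its hardcoded ranges are simplifying assumptions standing in for cpp_wcwidth and the generated tables.

#include <stdio.h>

/* Toy width function: assumed ranges only; GCC's cpp_wcwidth consults
   tables generated from the Unicode data files.  */
static int
toy_wcwidth (unsigned int c)
{
  if ((c >= 0x1100 && c <= 0x115f)       /* wide Hangul jamo */
      || (c >= 0x2e80 && c <= 0x9fff)    /* CJK */
      || (c >= 0x1f300 && c <= 0x1faff)) /* emoji */
    return 2;
  if (c >= 0x300 && c <= 0x36f)          /* combining diacritics */
    return 0;
  return 1;
}

/* Decode UTF-8 greedily (no validation; a stray byte just counts as one
   width-1 column, mirroring libcpp's fallback) and sum the widths.  */
static int
display_width (const unsigned char *s, int len)
{
  int w = 0;
  for (int i = 0; i < len; )
    {
      unsigned int c = s[i];
      int n = 1;
      if (c >= 0xf0) { c &= 0x07; n = 4; }
      else if (c >= 0xe0) { c &= 0x0f; n = 3; }
      else if (c >= 0xc0) { c &= 0x1f; n = 2; }
      for (int j = 1; j < n && i + j < len; ++j)
        c = (c << 6) | (s[i + j] & 0x3f);
      w += toy_wcwidth (c);
      i += n;
    }
  return w;
}

int
main (void)
{
  const char *line = "\xcf\x80 = 3.14";  /* 9 bytes, but only 8 columns */
  printf ("%d\n", display_width ((const unsigned char *) line, 9));
  return 0;
}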
@@ -3590,6 +3606,93 @@ test_line_offset_overflow ()
  ASSERT_NE (ordmap_a, ordmap_b);
}

void test_cpp_utf8 ()
{
  /* Verify that wcwidth of invalid UTF-8 or control bytes is 1.  */
  {
    int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8);
    ASSERT_EQ (8, w_bad);
    int w_ctrl = cpp_display_width ("\r\t\n\v\0\1", 6);
    ASSERT_EQ (6, w_ctrl);
  }

  /* Verify that wcwidth of valid UTF-8 is as expected.  */
  {
    const int w_pi = cpp_display_width ("\xcf\x80", 2);
    ASSERT_EQ (1, w_pi);
    const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4);
    ASSERT_EQ (2, w_emoji);
    const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2);
    ASSERT_EQ (1, w_umlaut_precomposed);
    const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3);
    ASSERT_EQ (1, w_umlaut_combining);
    const int w_han = cpp_display_width ("\xe4\xb8\xba", 3);
    ASSERT_EQ (2, w_han);
    const int w_ascii = cpp_display_width ("GCC", 3);
    ASSERT_EQ (3, w_ascii);
    const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82"
                                           "\x9f! \xe4\xb8\xba y\xcc\x88", 24);
    ASSERT_EQ (18, w_mixed);
  }

  /* Verify that cpp_byte_column_to_display_column can go past the end,
     and similar edge cases.  */
  {
    const char *str
      /* Display columns.
         111111112345  */
      = "\xcf\x80 abc";
      /* 111122223456
         Byte columns.  */

    ASSERT_EQ (5, cpp_display_width (str, 6));
    ASSERT_EQ (105, cpp_byte_column_to_display_column (str, 6, 106));
    ASSERT_EQ (10000, cpp_byte_column_to_display_column (NULL, 0, 10000));
    ASSERT_EQ (0, cpp_byte_column_to_display_column (NULL, 10000, 0));
  }

  /* Verify that cpp_display_column_to_byte_column can go past the end,
     and similar edge cases, and check invertibility.  */
  {
    const char *str
      /* Display columns.
         000000000000000000000000000000000000011
         111111112222222234444444455555555678901  */
      = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello";
      /* 000000000000000000000000000000000111111
         111122223333444456666777788889999012345
         Byte columns.  */
    ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2));
    ASSERT_EQ (15, cpp_display_column_to_byte_column (str, 15, 11));
    ASSERT_EQ (115, cpp_display_column_to_byte_column (str, 15, 111));
    ASSERT_EQ (10000, cpp_display_column_to_byte_column (NULL, 0, 10000));
    ASSERT_EQ (0, cpp_display_column_to_byte_column (NULL, 10000, 0));

    /* Verify that we do not interrupt a UTF-8 sequence.  */
    ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1));

    for (int byte_col = 1; byte_col <= 15; ++byte_col)
      {
        const int disp_col = cpp_byte_column_to_display_column (str, 15,
                                                                byte_col);
        const int byte_col2 = cpp_display_column_to_byte_column (str, 15,
                                                                 disp_col);

        /* If we ask for the display column in the middle of a UTF-8
           sequence, it will return the length of the partial sequence,
           matching the behavior of GCC before display column support.
           Otherwise check the round trip was successful.  */
        if (byte_col < 4)
          ASSERT_EQ (byte_col, disp_col);
        else if (byte_col >= 6 && byte_col < 9)
          ASSERT_EQ (3 + (byte_col - 5), disp_col);
        else
          ASSERT_EQ (byte_col2, byte_col);
      }
  }

}

/* Run all of the selftests within this file.  */

void
@@ -3631,6 +3734,8 @@ input_c_tests ()
  test_reading_source_line ();

  test_line_offset_overflow ();

  test_cpp_utf8 ();
}

} // namespace selftest
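For concreteness, the round-trip loop in test_cpp_utf8 above implies the following byte-to-display mapping for the emoji test string. This is just a worked reading of the assertions, not additional GCC code.

/* str = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello": 15 bytes, 11 display cols.

   byte cols 1-3   (inside 1st emoji)  -> disp 1, 2, 3  (partial bytes: 1 each)
   byte col  4     (1st emoji whole)   -> disp 2        (emoji is 2 cols wide)
   byte col  5     (the space)         -> disp 3
   byte cols 6-8   (inside 2nd emoji)  -> disp 4, 5, 6
   byte col  9     (2nd emoji whole)   -> disp 5
   byte cols 10-15 (" hello")          -> disp 6 .. 11

   Note the dip from byte 8 to byte 9: three width-1 partial bytes collapse
   into one width-2 emoji, which is why the loop checks round-trip
   invertibility only for byte columns outside partial sequences.  */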
gcc/input.h

@@ -38,6 +38,7 @@ STATIC_ASSERT (BUILTINS_LOCATION < RESERVED_LOCATION_COUNT);

extern bool is_location_from_builtin_token (location_t);
extern expanded_location expand_location (location_t);
extern int location_compute_display_column (expanded_location);

/* A class capturing the bounds of a buffer, to allow for run-time
   bounds-checking in a checked build.  */
gcc/testsuite/ChangeLog

@@ -1,3 +1,11 @@
2019-12-09  Lewis Hyatt  <lhyatt@gmail.com>

PR preprocessor/49973
* gcc.dg/plugin/diagnostic_plugin_test_show_locus.c
(test_show_locus): Tweak so that expected output is the same as
before the diagnostic-show-locus.c changes.
* gcc.dg/cpp/pr66415-1.c: Likewise.

2019-12-09  Eric Botcazou  <ebotcazou@adacore.com>

* gnat.dg/lto23.adb: New test.
gcc/testsuite/gcc.dg/cpp/pr66415-1.c

@@ -1,7 +1,7 @@
 /* PR c/66415 */
 /* { dg-do compile } */
 /* { dg-options "-Wformat -fdiagnostics-show-caret" } */
-/* { dg-set-compiler-env-var COLUMNS "82" } */
+/* { dg-set-compiler-env-var COLUMNS "83" } */
 
 void
 fn1 (void)
gcc/testsuite/gcc.dg/plugin/diagnostic_plugin_test_show_locus.c

@@ -174,7 +174,7 @@ test_show_locus (function *fun)
 
   /* Hardcode the "terminal width", to verify the behavior of
      very wide lines.  */
-  global_dc->caret_max_width = 70;
+  global_dc->caret_max_width = 71;
 
   if (0 == strcmp (fnname, "test_simple"))
     {
libcpp/ChangeLog

@@ -1,3 +1,18 @@
2019-12-09  Lewis Hyatt  <lhyatt@gmail.com>

PR preprocessor/49973
* generated_cpp_wcwidth.h: New file generated by
../contrib/unicode/gen_wcwidth.py, supports new cpp_wcwidth function.
* charset.c (compute_next_display_width): New function to help
implement display columns.
(cpp_byte_column_to_display_column): Likewise.
(cpp_display_column_to_byte_column): Likewise.
(cpp_wcwidth): Likewise.
* include/cpplib.h (cpp_byte_column_to_display_column): Declare.
(cpp_display_column_to_byte_column): Declare.
(cpp_wcwidth): Declare.
(cpp_display_width): New function.

2019-11-14  Joseph Myers  <joseph@codesourcery.com>

* charset.c (narrow_str_to_charconst): Make CPP_UTF8CHAR constants
libcpp/charset.c | 103
@@ -2265,3 +2265,106 @@ cpp_string_location_reader::get_next ()
  m_loc += m_offset_per_column;
  return result;
}

/* Helper for cpp_byte_column_to_display_column and its inverse.  Given a
   pointer to a UTF-8-encoded character, compute its display width.  *INBUFP
   points on entry to the start of the UTF-8 encoding of the character, and
   is updated to point just after the last byte of the encoding.  *INBYTESLEFTP
   contains on entry the remaining size of the buffer into which *INBUFP
   points, and this is also updated accordingly.  If *INBUFP does not
   point to a valid UTF-8-encoded sequence, then it will be treated as a single
   byte with display width 1.  */

static inline int
compute_next_display_width (const uchar **inbufp, size_t *inbytesleftp)
{
  cppchar_t c;
  if (one_utf8_to_cppchar (inbufp, inbytesleftp, &c) != 0)
    {
      /* Input is not convertible to UTF-8.  This could be fine, e.g. in a
         string literal, so don't complain.  Just treat it as if it has a width
         of one.  */
      ++*inbufp;
      --*inbytesleftp;
      return 1;
    }

  /* one_utf8_to_cppchar () has updated inbufp and inbytesleftp for us.  */
  return cpp_wcwidth (c);
}
/* For the string of length DATA_LENGTH bytes that begins at DATA, compute
   how many display columns are occupied by the first COLUMN bytes.  COLUMN
   may exceed DATA_LENGTH, in which case the phantom bytes at the end are
   treated as if they have display width 1.  */

int
cpp_byte_column_to_display_column (const char *data, int data_length,
                                   int column)
{
  int display_col = 0;
  const uchar *udata = (const uchar *) data;
  const int offset = MAX (0, column - data_length);
  size_t inbytesleft = column - offset;
  while (inbytesleft)
    display_col += compute_next_display_width (&udata, &inbytesleft);
  return display_col + offset;
}
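A worked call showing the phantom-byte rule; the numbers follow directly from the code above and are consistent with the self-tests added in gcc/input.c.

/* data = "\xcf\x80" (2 bytes encoding pi, display width 1), column = 5:
     offset      = MAX (0, 5 - 2) = 3   -> three phantom bytes, width 1 each
     inbytesleft = 5 - 3 = 2            -> the real bytes: one pi, width 1
   so cpp_byte_column_to_display_column ("\xcf\x80", 2, 5) == 1 + 3 == 4.  */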
/* For the string of length DATA_LENGTH bytes that begins at DATA, compute
   the least number of bytes that will result in at least DISPLAY_COL display
   columns.  The return value may exceed DATA_LENGTH if the entire string does
   not occupy enough display columns.  */

int
cpp_display_column_to_byte_column (const char *data, int data_length,
                                   int display_col)
{
  int column = 0;
  const uchar *udata = (const uchar *) data;
  size_t inbytesleft = data_length;
  while (column < display_col && inbytesleft)
    column += compute_next_display_width (&udata, &inbytesleft);
  return data_length - inbytesleft + MAX (0, display_col - column);
}
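Because compute_next_display_width always consumes a whole character, this direction never answers with a byte position inside a multibyte sequence. A worked call, using the emoji from the self-tests:

/* data = "\xf0\x9f\x98\x82" (4 bytes, one emoji, display width 2),
   display_col = 1: the loop must consume the entire character to reach
   at least 1 display column, leaving column == 2 and inbytesleft == 0:
     return 4 - 0 + MAX (0, 1 - 2) == 4
   so display column 1 maps to byte column 4, the end of the sequence,
   matching ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1))
   in the self-tests.  */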
/* Our own version of wcwidth().  We don't use the actual wcwidth() in glibc,
   because that will inspect the user's locale, and in particular in an ASCII
   locale, it will not return anything useful for extended characters.  But GCC
   in other respects (see e.g. _cpp_default_encoding()) behaves as if
   everything is UTF-8.  We also make some tweaks that are useful for the way
   GCC needs to use this data, e.g. tabs and other control characters should be
   treated as having width 1.  The lookup tables are generated from
   contrib/unicode/gen_wcwidth.py and were made by simply calling glibc
   wcwidth() on all codepoints, then applying the small tweaks.  These tables
   are not highly optimized, but for the present purpose of outputting
   diagnostics, they are sufficient.  */

#include "generated_cpp_wcwidth.h"
int cpp_wcwidth (cppchar_t c)
{
  if (__builtin_expect (c <= wcwidth_range_ends[0], true))
    return wcwidth_widths[0];

  /* Binary search the tables.  */
  int begin = 1;
  static const int end
    = sizeof wcwidth_range_ends / sizeof (*wcwidth_range_ends);
  int len = end - begin;
  do
    {
      int half = len / 2;
      int middle = begin + half;
      if (c > wcwidth_range_ends[middle])
        {
          begin = middle + 1;
          len -= half + 1;
        }
      else
        len = half;
    } while (len);

  if (__builtin_expect (begin != end, true))
    return wcwidth_widths[begin];
  return 1;
}
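The table encoding that the binary search walks is worth spelling out; this is a reading of the generated data below, not new behavior.

/* wcwidth_range_ends[] holds the last codepoint of each maximal run of
   equal width, and wcwidth_widths[i] is the width shared by every
   codepoint in (wcwidth_range_ends[i-1], wcwidth_range_ends[i]].  With
   the generated data below:
     range_ends[0] = 0x2ff, widths[0] = 1  -> 'A' (U+0041) has width 1
     range_ends[1] = 0x36f, widths[1] = 0  -> combining U+0301 has width 0
   The search finds the first range end that is >= c; codepoints beyond
   the last table entry get the default width 1.  */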
libcpp/generated_cpp_wcwidth.h (new file) | 156
@@ -0,0 +1,156 @@
/* Generated by contrib/unicode/gen_wcwidth.py, with the help of glibc's
   utf8_gen.py, using version 12.1.0 of the Unicode standard.  */

static const cppchar_t wcwidth_range_ends[] = {
  0x2ff, 0x36f, 0x482, 0x489, 0x590, 0x5bd, 0x5be, 0x5bf,
  0x5c0, 0x5c2, 0x5c3, 0x5c5, 0x5c6, 0x5c7, 0x60f, 0x61a,
  0x61b, 0x61c, 0x64a, 0x65f, 0x66f, 0x670, 0x6d5, 0x6dc,
  0x6de, 0x6e4, 0x6e6, 0x6e8, 0x6e9, 0x6ed, 0x710, 0x711,
  0x72f, 0x74a, 0x7a5, 0x7b0, 0x7ea, 0x7f3, 0x7fc, 0x7fd,
  0x815, 0x819, 0x81a, 0x823, 0x824, 0x827, 0x828, 0x82d,
  0x858, 0x85b, 0x8d2, 0x8e1, 0x8e2, 0x902, 0x939, 0x93a,
  0x93b, 0x93c, 0x940, 0x948, 0x94c, 0x94d, 0x950, 0x957,
  0x961, 0x963, 0x980, 0x981, 0x9bb, 0x9bc, 0x9c0, 0x9c4,
  0x9cc, 0x9cd, 0x9e1, 0x9e3, 0x9fd, 0x9fe, 0xa00, 0xa02,
  0xa3b, 0xa3c, 0xa40, 0xa42, 0xa46, 0xa48, 0xa4a, 0xa4d,
  0xa50, 0xa51, 0xa6f, 0xa71, 0xa74, 0xa75, 0xa80, 0xa82,
  0xabb, 0xabc, 0xac0, 0xac5, 0xac6, 0xac8, 0xacc, 0xacd,
  0xae1, 0xae3, 0xaf9, 0xaff, 0xb00, 0xb01, 0xb3b, 0xb3c,
  0xb3e, 0xb3f, 0xb40, 0xb44, 0xb4c, 0xb4d, 0xb55, 0xb56,
  0xb61, 0xb63, 0xb81, 0xb82, 0xbbf, 0xbc0, 0xbcc, 0xbcd,
  0xbff, 0xc00, 0xc03, 0xc04, 0xc3d, 0xc40, 0xc45, 0xc48,
  0xc49, 0xc4d, 0xc54, 0xc56, 0xc61, 0xc63, 0xc80, 0xc81,
  0xcbb, 0xcbc, 0xcbe, 0xcbf, 0xcc5, 0xcc6, 0xccb, 0xccd,
  0xce1, 0xce3, 0xcff, 0xd01, 0xd3a, 0xd3c, 0xd40, 0xd44,
  0xd4c, 0xd4d, 0xd61, 0xd63, 0xdc9, 0xdca, 0xdd1, 0xdd4,
  0xdd5, 0xdd6, 0xe30, 0xe31, 0xe33, 0xe3a, 0xe46, 0xe4e,
  0xeb0, 0xeb1, 0xeb3, 0xebc, 0xec7, 0xecd, 0xf17, 0xf19,
  0xf34, 0xf35, 0xf36, 0xf37, 0xf38, 0xf39, 0xf70, 0xf7e,
  0xf7f, 0xf84, 0xf85, 0xf87, 0xf8c, 0xf97, 0xf98, 0xfbc,
  0xfc5, 0xfc6, 0x102c, 0x1030, 0x1031, 0x1037, 0x1038, 0x103a,
  0x103c, 0x103e, 0x1057, 0x1059, 0x105d, 0x1060, 0x1070, 0x1074,
  0x1081, 0x1082, 0x1084, 0x1086, 0x108c, 0x108d, 0x109c, 0x109d,
  0x10ff, 0x115f, 0x11ff, 0x135c, 0x135f, 0x1711, 0x1714, 0x1731,
  0x1734, 0x1751, 0x1753, 0x1771, 0x1773, 0x17b3, 0x17b5, 0x17b6,
  0x17bd, 0x17c5, 0x17c6, 0x17c8, 0x17d3, 0x17dc, 0x17dd, 0x180a,
  0x180e, 0x1884, 0x1886, 0x18a8, 0x18a9, 0x191f, 0x1922, 0x1926,
  0x1928, 0x1931, 0x1932, 0x1938, 0x193b, 0x1a16, 0x1a18, 0x1a1a,
  0x1a1b, 0x1a55, 0x1a56, 0x1a57, 0x1a5e, 0x1a5f, 0x1a60, 0x1a61,
  0x1a62, 0x1a64, 0x1a6c, 0x1a72, 0x1a7c, 0x1a7e, 0x1a7f, 0x1aaf,
  0x1abe, 0x1aff, 0x1b03, 0x1b33, 0x1b34, 0x1b35, 0x1b3a, 0x1b3b,
  0x1b3c, 0x1b41, 0x1b42, 0x1b6a, 0x1b73, 0x1b7f, 0x1b81, 0x1ba1,
  0x1ba5, 0x1ba7, 0x1ba9, 0x1baa, 0x1bad, 0x1be5, 0x1be6, 0x1be7,
  0x1be9, 0x1bec, 0x1bed, 0x1bee, 0x1bf1, 0x1c2b, 0x1c33, 0x1c35,
  0x1c37, 0x1ccf, 0x1cd2, 0x1cd3, 0x1ce0, 0x1ce1, 0x1ce8, 0x1cec,
  0x1ced, 0x1cf3, 0x1cf4, 0x1cf7, 0x1cf9, 0x1dbf, 0x1df9, 0x1dfa,
  0x1dff, 0x200a, 0x200f, 0x2029, 0x202e, 0x205f, 0x2064, 0x2065,
  0x206f, 0x20cf, 0x20f0, 0x2319, 0x231b, 0x2328, 0x232a, 0x23e8,
  0x23ec, 0x23ef, 0x23f0, 0x23f2, 0x23f3, 0x25fc, 0x25fe, 0x2613,
  0x2615, 0x2647, 0x2653, 0x267e, 0x267f, 0x2692, 0x2693, 0x26a0,
  0x26a1, 0x26a9, 0x26ab, 0x26bc, 0x26be, 0x26c3, 0x26c5, 0x26cd,
  0x26ce, 0x26d3, 0x26d4, 0x26e9, 0x26ea, 0x26f1, 0x26f3, 0x26f4,
  0x26f5, 0x26f9, 0x26fa, 0x26fc, 0x26fd, 0x2704, 0x2705, 0x2709,
  0x270b, 0x2727, 0x2728, 0x274b, 0x274c, 0x274d, 0x274e, 0x2752,
  0x2755, 0x2756, 0x2757, 0x2794, 0x2797, 0x27af, 0x27b0, 0x27be,
  0x27bf, 0x2b1a, 0x2b1c, 0x2b4f, 0x2b50, 0x2b54, 0x2b55, 0x2cee,
  0x2cf1, 0x2d7e, 0x2d7f, 0x2ddf, 0x2dff, 0x2e7f, 0x2e99, 0x2e9a,
  0x2ef3, 0x2eff, 0x2fd5, 0x2fef, 0x2ffb, 0x2fff, 0x3029, 0x302d,
  0x303e, 0x3040, 0x3096, 0x3098, 0x309a, 0x30ff, 0x3104, 0x312f,
  0x3130, 0x318e, 0x318f, 0x31ba, 0x31bf, 0x31e3, 0x31ef, 0x321e,
  0x321f, 0x4db5, 0x4dbf, 0x9fef, 0x9fff, 0xa48c, 0xa48f, 0xa4c6,
  0xa66e, 0xa672, 0xa673, 0xa67d, 0xa69d, 0xa69f, 0xa6ef, 0xa6f1,
  0xa801, 0xa802, 0xa805, 0xa806, 0xa80a, 0xa80b, 0xa824, 0xa826,
  0xa8c3, 0xa8c5, 0xa8df, 0xa8f1, 0xa8fe, 0xa8ff, 0xa925, 0xa92d,
  0xa946, 0xa951, 0xa95f, 0xa97c, 0xa97f, 0xa982, 0xa9b2, 0xa9b3,
  0xa9b5, 0xa9b9, 0xa9bb, 0xa9bd, 0xa9e4, 0xa9e5, 0xaa28, 0xaa2e,
  0xaa30, 0xaa32, 0xaa34, 0xaa36, 0xaa42, 0xaa43, 0xaa4b, 0xaa4c,
  0xaa7b, 0xaa7c, 0xaaaf, 0xaab0, 0xaab1, 0xaab4, 0xaab6, 0xaab8,
  0xaabd, 0xaabf, 0xaac0, 0xaac1, 0xaaeb, 0xaaed, 0xaaf5, 0xaaf6,
  0xabe4, 0xabe5, 0xabe7, 0xabe8, 0xabec, 0xabed, 0xabff, 0xd7a3,
  0xf8ff, 0xfa6d, 0xfa6f, 0xfad9, 0xfb1d, 0xfb1e, 0xfdff, 0xfe0f,
  0xfe19, 0xfe1f, 0xfe2f, 0xfe52, 0xfe53, 0xfe66, 0xfe67, 0xfe6b,
  0xfefe, 0xfeff, 0xff00, 0xff60, 0xffdf, 0xffe6, 0xfff8, 0xfffb,
  0x101fc, 0x101fd, 0x102df, 0x102e0, 0x10375, 0x1037a, 0x10a00, 0x10a03,
  0x10a04, 0x10a06, 0x10a0b, 0x10a0f, 0x10a37, 0x10a3a, 0x10a3e, 0x10a3f,
  0x10ae4, 0x10ae6, 0x10d23, 0x10d27, 0x10f45, 0x10f50, 0x11000, 0x11001,
  0x11037, 0x11046, 0x1107e, 0x11081, 0x110b2, 0x110b6, 0x110b8, 0x110ba,
  0x110ff, 0x11102, 0x11126, 0x1112b, 0x1112c, 0x11134, 0x11172, 0x11173,
  0x1117f, 0x11181, 0x111b5, 0x111be, 0x111c8, 0x111cc, 0x1122e, 0x11231,
  0x11233, 0x11234, 0x11235, 0x11237, 0x1123d, 0x1123e, 0x112de, 0x112df,
  0x112e2, 0x112ea, 0x112ff, 0x11301, 0x1133a, 0x1133c, 0x1133f, 0x11340,
  0x11365, 0x1136c, 0x1136f, 0x11374, 0x11437, 0x1143f, 0x11441, 0x11444,
  0x11445, 0x11446, 0x1145d, 0x1145e, 0x114b2, 0x114b8, 0x114b9, 0x114ba,
  0x114be, 0x114c0, 0x114c1, 0x114c3, 0x115b1, 0x115b5, 0x115bb, 0x115bd,
  0x115be, 0x115c0, 0x115db, 0x115dd, 0x11632, 0x1163a, 0x1163c, 0x1163d,
  0x1163e, 0x11640, 0x116aa, 0x116ab, 0x116ac, 0x116ad, 0x116af, 0x116b5,
  0x116b6, 0x116b7, 0x1171c, 0x1171f, 0x11721, 0x11725, 0x11726, 0x1172b,
  0x1182e, 0x11837, 0x11838, 0x1183a, 0x119d3, 0x119d7, 0x119d9, 0x119db,
  0x119df, 0x119e0, 0x11a00, 0x11a0a, 0x11a32, 0x11a38, 0x11a3a, 0x11a3e,
  0x11a46, 0x11a47, 0x11a50, 0x11a56, 0x11a58, 0x11a5b, 0x11a89, 0x11a96,
  0x11a97, 0x11a99, 0x11c2f, 0x11c36, 0x11c37, 0x11c3d, 0x11c3e, 0x11c3f,
  0x11c91, 0x11ca7, 0x11ca9, 0x11cb0, 0x11cb1, 0x11cb3, 0x11cb4, 0x11cb6,
  0x11d30, 0x11d36, 0x11d39, 0x11d3a, 0x11d3b, 0x11d3d, 0x11d3e, 0x11d45,
  0x11d46, 0x11d47, 0x11d8f, 0x11d91, 0x11d94, 0x11d95, 0x11d96, 0x11d97,
  0x11ef2, 0x11ef4, 0x1342f, 0x13438, 0x16aef, 0x16af4, 0x16b2f, 0x16b36,
  0x16f4e, 0x16f4f, 0x16f8e, 0x16f92, 0x16fdf, 0x16fe3, 0x16fff, 0x187f7,
  0x187ff, 0x18af2, 0x1afff, 0x1b11e, 0x1b14f, 0x1b152, 0x1b163, 0x1b167,
  0x1b16f, 0x1b2fb, 0x1bc9c, 0x1bc9e, 0x1bc9f, 0x1bca3, 0x1d166, 0x1d169,
  0x1d172, 0x1d182, 0x1d184, 0x1d18b, 0x1d1a9, 0x1d1ad, 0x1d241, 0x1d244,
  0x1d9ff, 0x1da36, 0x1da3a, 0x1da6c, 0x1da74, 0x1da75, 0x1da83, 0x1da84,
  0x1da9a, 0x1da9f, 0x1daa0, 0x1daaf, 0x1dfff, 0x1e006, 0x1e007, 0x1e018,
  0x1e01a, 0x1e021, 0x1e022, 0x1e024, 0x1e025, 0x1e02a, 0x1e12f, 0x1e136,
  0x1e2eb, 0x1e2ef, 0x1e8cf, 0x1e8d6, 0x1e943, 0x1e94a, 0x1f003, 0x1f004,
  0x1f0ce, 0x1f0cf, 0x1f18d, 0x1f18e, 0x1f190, 0x1f19a, 0x1f1ff, 0x1f202,
  0x1f20f, 0x1f23b, 0x1f23f, 0x1f248, 0x1f24f, 0x1f251, 0x1f25f, 0x1f265,
  0x1f2ff, 0x1f320, 0x1f32c, 0x1f335, 0x1f336, 0x1f37c, 0x1f37d, 0x1f393,
  0x1f39f, 0x1f3ca, 0x1f3ce, 0x1f3d3, 0x1f3df, 0x1f3f0, 0x1f3f3, 0x1f3f4,
  0x1f3f7, 0x1f43e, 0x1f43f, 0x1f440, 0x1f441, 0x1f4fc, 0x1f4fe, 0x1f53d,
  0x1f54a, 0x1f54e, 0x1f54f, 0x1f567, 0x1f579, 0x1f57a, 0x1f594, 0x1f596,
  0x1f5a3, 0x1f5a4, 0x1f5fa, 0x1f64f, 0x1f67f, 0x1f6c5, 0x1f6cb, 0x1f6cc,
  0x1f6cf, 0x1f6d2, 0x1f6d4, 0x1f6d5, 0x1f6ea, 0x1f6ec, 0x1f6f3, 0x1f6fa,
  0x1f7df, 0x1f7eb, 0x1f90c, 0x1f971, 0x1f972, 0x1f976, 0x1f979, 0x1f9a2,
  0x1f9a4, 0x1f9aa, 0x1f9ad, 0x1f9ca, 0x1f9cc, 0x1f9ff, 0x1fa6f, 0x1fa73,
  0x1fa77, 0x1fa7a, 0x1fa7f, 0x1fa82, 0x1fa8f, 0x1fa95, 0x1ffff, 0x2a6d6,
  0x2a6ff, 0x2b734, 0x2b73f, 0x2b81d, 0x2b81f, 0x2cea1, 0x2ceaf, 0x2ebe0,
  0x2f7ff, 0x2fa1d, 0xe0000, 0xe0001, 0xe001f, 0xe007f, 0xe00ff, 0xe01ef,
};

static const unsigned char wcwidth_widths[] = {
  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 0, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1,
  2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1,
  2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1,
  0, 1, 0, 1, 0, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 0, 2, 1, 2, 1, 0, 2, 1, 2,
  1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 1, 2, 1, 2, 1, 0, 1, 0,
  2, 1, 0, 2, 1, 2, 1, 2, 1, 0, 1, 2, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
  1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
  1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
  1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
  1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
  1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 0, 1, 0, 1, 0,
};
libcpp/include/cpplib.h

@@ -1320,4 +1320,15 @@ extern bool cpp_userdef_char_p
extern const char * cpp_get_userdef_suffix
  (const cpp_token *);

/* In charset.c */
int cpp_byte_column_to_display_column (const char *data, int data_length,
                                       int column);
inline int cpp_display_width (const char *data, int data_length)
{
  return cpp_byte_column_to_display_column (data, data_length, data_length);
}
int cpp_display_column_to_byte_column (const char *data, int data_length,
                                       int display_col);
int cpp_wcwidth (cppchar_t c);

#endif /* ! LIBCPP_CPPLIB_H */
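Taken together these entry points form a byte/display conversion pair. A hypothetical caller, not part of this patch (clamp_to_display_width is an invented name), might combine them like so:

/* Hypothetical helper: clamp a caret's byte column so that it fits in
   MAX_DISPLAY_COLS display columns, re-snapping to a character boundary.  */
static int
clamp_to_display_width (const char *line, int line_len, int byte_col,
                        int max_display_cols)
{
  int disp = cpp_byte_column_to_display_column (line, line_len, byte_col);
  if (disp > max_display_cols)
    disp = max_display_cols;
  /* The inverse mapping never lands inside a multibyte sequence.  */
  return cpp_display_column_to_byte_column (line, line_len, disp);
}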