#!/usr/bin/env python3
#
# Check gcc.pot file for stylistic issues as described in
# https://gcc.gnu.org/onlinedocs/gccint/Guidelines-for-Diagnostics.html,
# especially in gcc-internal-format messages.
#
# This file is part of GCC.
#
# GCC is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.
#
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3.  If not see
# <http://www.gnu.org/licenses/>.

import argparse
import re
from collections import Counter
from typing import Dict, Match

import polib

# Maps each diagnostic text to the number of times warn() has reported it.
seen_warnings = Counter()


def location(msg: polib.POEntry):
    """
    Returns 'file:line' for the first occurrence recorded for the message,
    or the empty string if the message has no occurrences.
    """
    if msg.occurrences:
        occ = msg.occurrences[0]
        return f'{occ[0]}:{occ[1]}'
    return ''


def warn(msg: polib.POEntry,
         diagnostic_id: str, diagnostic: str, include_msgid=True):
    """
    Prints a diagnostic for the message and counts it in seen_warnings.

    The diagnostic is prefixed with the location of the message; if
    include_msgid is true, the repr of the msgid is appended.

    To suppress a warning for a particular message,
    add a line "#, gcclint:ignore:{diagnostic_id}" to the message.
    """

    if f'gcclint:ignore:{diagnostic_id}' in msg.flags:
        return

    seen_warnings[diagnostic] += 1

    if include_msgid:
        print(f'{location(msg)}: {diagnostic} in {repr(msg.msgid)}')
    else:
        print(f'{location(msg)}: {diagnostic}')


def lint_gcc_internal_format(msg: polib.POEntry):
    """
    Checks a single message that has the gcc-internal-format. These
    messages use a variety of placeholders like %qs, %<quotes%> and
    %q#E.
    """

    msgid: str = msg.msgid

    def outside_quotes(m: Match[str]):
        """
        Tells whether the match starts outside of a %<...%> pair, that is,
        whether the text before the match has as many %< as %>.
        """
        before = msgid[:m.start(0)]
        return before.count('%<') == before.count('%>')

    def lint_matching_placeholders():
        """
        Warns when literal values in placeholders are not exactly equal
        in the translation. This can happen when doing copy-and-paste
        translations of similar messages.

        To avoid these mismatches in the first place,
        structurally equal messages are found by
        lint_diagnostics_differing_only_in_placeholders.

        This check only applies when checking a finished translation
        such as de.po, not gcc.pot.
        """

        if not msg.translated():
            return

        in_msgid = re.findall('%<[^%]+%>', msgid)
        in_msgstr = re.findall('%<[^%]+%>', msg.msgstr)

        # Order and multiplicity are ignored; only the set of quoted
        # literals has to agree.
        if set(in_msgid) != set(in_msgstr):
            warn(msg,
                 'placeholder-mismatch',
                 f'placeholder mismatch: msgid has {in_msgid}, '
                 f'msgstr has {in_msgstr}',
                 include_msgid=False)

    def lint_option_outside_quotes():
        """
        Warns about whitespace-separated words outside of quotes that
        look like a command line option (a dash followed by a letter,
        except for -INF) or that start with __builtin_.
        """
        for match in re.finditer(r'\S+', msgid):
            part = match.group()
            if not outside_quotes(match):
                continue

            if part.startswith('-'):
                if len(part) >= 2 and part[1].isalpha():
                    if part == '-INF':
                        continue

                    warn(msg,
                         'option-outside-quotes',
                         'command line option outside %<quotes%>')

            if part.startswith('__builtin_'):
                warn(msg,
                     'builtin-outside-quotes',
                     'builtin function outside %<quotes%>')

    def lint_plain_apostrophe():
        """
        Warns about each apostrophe outside of quotes that is not
        directly preceded by a percent sign.
        """
        # The lookbehind keeps the match on the apostrophe itself.
        # Consuming the preceding character instead would make
        # outside_quotes look one character too early (it would not see
        # a %> or %< that ends directly before the apostrophe) and
        # would miss an apostrophe at the very start of the message.
        for match in re.finditer(r"(?<!%)'", msgid):
            if outside_quotes(match):
                warn(msg, 'apostrophe', 'apostrophe without leading %')

    def lint_space_before_quote():
        """
        A missing space before %< is often the result of string literals
        that are joined by the C compiler and neither literal has a space
        to separate the words.
        """

        for match in re.finditer('(.?[a-zA-Z0-9])%<', msgid):
            # A directly preceding %s placeholder is not reported.
            if match.group(1) != '%s':
                warn(msg,
                     'no-space-before-quote',
                     '%< directly following a letter or digit')

    def lint_underscore_outside_quotes():
        """
        An underscore outside of quotes is used in several contexts,
        and many of them violate the GCC Guidelines for Diagnostics:

        * names of GCC-internal compiler functions
        * names of GCC-internal data structures
        * static_cast and the like (which are legitimate)
        """

        for match in re.finditer('_', msgid):
            if outside_quotes(match):
                warn(msg,
                     'underscore-outside-quotes',
                     'underscore outside of %<quotes%>')
                # One warning per message is enough.
                return

    def lint_may_not():
        """
        The term "may not" may either mean "it could be the case"
        or "should not". These two different meanings are sometimes
        hard to tell apart.
        """

        if re.search(r'\bmay not\b', msgid):
            warn(msg,
                 'ambiguous-may-not',
                 'the term "may not" is ambiguous')

    def lint_unbalanced_quotes():
        """
        Warns when the number of %< differs from the number of %>,
        in the msgid and, for translated messages, in the msgstr.
        """
        if msgid.count('%<') != msgid.count('%>'):
            warn(msg,
                 'unbalanced-quotes',
                 'unbalanced %< and %> quotes')

        if msg.translated():
            if msg.msgstr.count('%<') != msg.msgstr.count('%>'):
                warn(msg,
                     'unbalanced-quotes',
                     'unbalanced %< and %> quotes')

    def lint_single_space_after_sentence():
        """
        After a sentence there should be two spaces.
        """

        if re.search(r'[.] [A-Z]', msgid):
            warn(msg,
                 'single-space-after-sentence',
                 'single space after sentence')

    def lint_non_canonical_quotes():
        """
        Catches %<%s%>, which can be written in the shorter form %qs.
        """
        match = re.search("%<%s%>|'%s'|\"%s\"|`%s'", msgid)
        if match:
            warn(msg,
                 'non-canonical-quotes',
                 f'placeholder {match.group()} should be written as %qs')

    lint_option_outside_quotes()
    lint_plain_apostrophe()
    lint_space_before_quote()
    lint_underscore_outside_quotes()
    lint_may_not()
    lint_unbalanced_quotes()
    lint_matching_placeholders()
    lint_single_space_after_sentence()
    lint_non_canonical_quotes()


def lint_diagnostics_differing_only_in_placeholders(po: polib.POFile):
    """
    Detects messages that are structurally the same, except that they
    use different plain strings inside %<quotes%>. These messages can be
    merged in order to prevent copy-and-paste mistakes by the translators.

    See bug 90119.
    """

    seen: Dict[str, polib.POEntry] = {}

    for msg in po:
        msg: polib.POEntry
        msgid = msg.msgid

        # Replace every quoted literal by %qs, so that messages that
        # only differ in these literals get the same key.
        normalized = re.sub('%<[^%]+%>', '%qs', msgid)
        if normalized not in seen:
            # The first message of each pattern is recorded under the
            # normalized text and under its unmodified msgid.
            seen[normalized] = msg
            seen[msgid] = msg
            continue

        prev = seen[normalized]
        warn(msg,
             'same-pattern',
             f'same pattern for {repr(msgid)} and '
             f'{repr(prev.msgid)} in {location(prev)}',
             include_msgid=False)


def lint_file(po: polib.POFile):
    """
    Runs the per-message checks on every gcc-internal-format message
    that is neither obsolete nor fuzzy, then runs the file-wide check
    for messages that only differ in their quoted literals.
    """
    for msg in po:
        msg: polib.POEntry

        if not msg.obsolete and not msg.fuzzy:
            if 'gcc-internal-format' in msg.flags:
                lint_gcc_internal_format(msg)

    lint_diagnostics_differing_only_in_placeholders(po)


def main():
    """
    Lints the file given on the command line and prints a summary of
    the diagnostics that were reported more than once.
    """
    parser = argparse.ArgumentParser(
        description='Check gcc.pot or a translated .po file for '
                    'stylistic issues in diagnostic messages.')
    parser.add_argument('file', help='pot or po file')

    args = parser.parse_args()

    po = polib.pofile(args.file)
    lint_file(po)

    print()
    print('summary:')
    for entry in seen_warnings.most_common():
        # Diagnostics that occurred only once are left out of the summary.
        if entry[1] > 1:
            print(f'{entry[1]}\t{entry[0]}')


if __name__ == '__main__':
    main()