Skip to content

gh-137353: Add t-string support to gettext + pygettext #137354

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
243 changes: 233 additions & 10 deletions Lib/gettext.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,13 @@
# you'll need to study the GNU gettext code to do this.


import ast
import operator
import os
import sys
from functools import cache
from string.templatelib import Interpolation, Template, convert
from typing import Any


__all__ = ['NullTranslations', 'GNUTranslations', 'Catalog',
Expand Down Expand Up @@ -290,11 +294,23 @@ def add_fallback(self, fallback):
def gettext(self, message):
    """Return *message* untranslated, or defer to the fallback if one is set.

    A t-string (``Template``) is converted to a format string and rendered
    with the values of its interpolations; any other message is returned
    unchanged.
    """
    fallback = self._fallback
    if fallback:
        return fallback.gettext(message)
    if not isinstance(message, Template):
        return message
    format_string, field_values = _template_to_format(message)
    return format_string.format(**field_values)

def ngettext(self, msgid1, msgid2, n):
if self._fallback:
return self._fallback.ngettext(msgid1, msgid2, n)
msgid1_is_template = isinstance(msgid1, Template)
msgid2_is_template = isinstance(msgid2, Template)
if msgid1_is_template and msgid2_is_template:
message, values = _template_to_format(
msgid1 if n == 1 else msgid2
)
return message.format(**values)
elif msgid1_is_template or msgid2_is_template:
raise TypeError('msgids cannot mix strings and t-strings')
n = _as_int2(n)
if n == 1:
return msgid1
Expand All @@ -304,11 +320,23 @@ def ngettext(self, msgid1, msgid2, n):
def pgettext(self, context, message):
    """Return *message* untranslated, or defer to the fallback if one is set.

    *context* is only forwarded to the fallback.  A t-string (``Template``)
    is converted to a format string and rendered with the values of its
    interpolations; any other message is returned unchanged.
    """
    fallback = self._fallback
    if fallback:
        return fallback.pgettext(context, message)
    if not isinstance(message, Template):
        return message
    format_string, field_values = _template_to_format(message)
    return format_string.format(**field_values)

def npgettext(self, context, msgid1, msgid2, n):
if self._fallback:
return self._fallback.npgettext(context, msgid1, msgid2, n)
msgid1_is_template = isinstance(msgid1, Template)
msgid2_is_template = isinstance(msgid2, Template)
if msgid1_is_template and msgid2_is_template:
message, values = _template_to_format(
msgid1 if n == 1 else msgid2
)
return message.format(**values)
elif msgid1_is_template or msgid2_is_template:
raise TypeError('msgids cannot mix strings and t-strings')
n = _as_int2(n)
if n == 1:
return msgid1
Expand Down Expand Up @@ -437,50 +465,104 @@ def _parse(self, fp):

def gettext(self, message):
    """Look up *message* in the catalog and return its translation.

    A t-string (``Template``) is first converted to a format string, which
    is used as the catalog key; the translation (or, when nothing is found,
    the format string itself) is then rendered with the interpolation
    values.  If the message is not in the catalog and a fallback is set,
    the fallback receives the *original* message object so that it can do
    its own t-string handling.
    """
    missing = object()
    orig_message = message
    t_values = None
    if isinstance(message, Template):
        message, t_values = _template_to_format(message)
    tmsg = self._catalog.get(message, missing)
    if tmsg is missing:
        # The message may only be stored as the first form of a plural entry.
        tmsg = self._catalog.get((message, self.plural(1)), missing)
    if tmsg is not missing:
        if t_values is not None:
            return tmsg.format(**t_values)
        return tmsg
    if self._fallback:
        # Pass the untouched message, not the derived format string.
        return self._fallback.gettext(orig_message)
    if t_values is not None:
        return message.format(**t_values)
    return message

def ngettext(self, msgid1, msgid2, n):
    """Look up the plural form of a message for the count *n*.

    *msgid1* and *msgid2* must either both be plain strings or both be
    t-strings (``Template``); mixing them raises ``TypeError``.  Templates
    are converted to format strings (the one derived from *msgid1* is the
    catalog key) and the result is rendered with the interpolation values.

    If the entry is missing, the fallback (when set) is called with the
    original message objects; otherwise *msgid1* is used when ``n == 1``
    and *msgid2* in all other cases.
    """
    orig_msgid1 = msgid1
    orig_msgid2 = msgid2
    msgid1_is_template = isinstance(msgid1, Template)
    msgid2_is_template = isinstance(msgid2, Template)
    t_values1 = t_values2 = None
    if msgid1_is_template and msgid2_is_template:
        msgid1, t_values1 = _template_to_format(msgid1)
        msgid2, t_values2 = _template_to_format(msgid2)
    elif msgid1_is_template or msgid2_is_template:
        raise TypeError('msgids cannot mix strings and t-strings')
    plural = self.plural(n)
    try:
        tmsg = self._catalog[(msgid1, plural)]
    except KeyError:
        if self._fallback:
            # Pass the untouched messages, not the derived format strings.
            return self._fallback.ngettext(orig_msgid1, orig_msgid2, n)
        if n == 1:
            if t_values1 is not None:
                return msgid1.format(**t_values1)
            return msgid1
        if t_values2 is not None:
            return msgid2.format(**t_values2)
        return msgid2
    if t_values1 is None:
        return tmsg
    # A catalog plural index does not map one-to-one onto the two source
    # forms: index 0 may also cover counts other than 1, and its translation
    # may then use placeholders that only occur in msgid2.  Values of the
    # form matching the index take precedence; the other form only fills in
    # names that would otherwise be missing.
    if plural:
        t_values = t_values1 | t_values2
    else:
        t_values = t_values2 | t_values1
    return tmsg.format(**t_values)

def pgettext(self, context, message):
    """Look up *message* within *context* and return its translation.

    A t-string (``Template``) is first converted to a format string, which
    is combined with *context* to build the catalog key; the translation
    (or, when nothing is found, the format string itself) is then rendered
    with the interpolation values.  If the message is not in the catalog
    and a fallback is set, the fallback receives the *original* message
    object so that it can do its own t-string handling.
    """
    orig_message = message
    t_values = None
    if isinstance(message, Template):
        message, t_values = _template_to_format(message)
    ctxt_msg_id = self.CONTEXT % (context, message)
    missing = object()
    tmsg = self._catalog.get(ctxt_msg_id, missing)
    if tmsg is missing:
        # The message may only be stored as the first form of a plural entry.
        tmsg = self._catalog.get((ctxt_msg_id, self.plural(1)), missing)
    if tmsg is not missing:
        if t_values is not None:
            return tmsg.format(**t_values)
        return tmsg
    if self._fallback:
        # Pass the untouched message, not the derived format string.
        return self._fallback.pgettext(context, orig_message)
    if t_values is not None:
        return message.format(**t_values)
    return message

def npgettext(self, context, msgid1, msgid2, n):
    """Look up the plural form of a message within *context* for count *n*.

    *msgid1* and *msgid2* must either both be plain strings or both be
    t-strings (``Template``); mixing them raises ``TypeError``.  Templates
    are converted to format strings (the one derived from *msgid1* is
    combined with *context* to build the catalog key) and the result is
    rendered with the interpolation values.

    If the entry is missing, the fallback (when set) is called with the
    original message objects; otherwise *msgid1* is used when ``n == 1``
    and *msgid2* in all other cases.
    """
    orig_msgid1 = msgid1
    orig_msgid2 = msgid2
    msgid1_is_template = isinstance(msgid1, Template)
    msgid2_is_template = isinstance(msgid2, Template)
    t_values1 = t_values2 = None
    if msgid1_is_template and msgid2_is_template:
        msgid1, t_values1 = _template_to_format(msgid1)
        msgid2, t_values2 = _template_to_format(msgid2)
    elif msgid1_is_template or msgid2_is_template:
        raise TypeError('msgids cannot mix strings and t-strings')
    plural = self.plural(n)
    ctxt_msg_id = self.CONTEXT % (context, msgid1)
    try:
        tmsg = self._catalog[ctxt_msg_id, plural]
    except KeyError:
        if self._fallback:
            # Pass the untouched messages, not the derived format strings.
            return self._fallback.npgettext(
                context, orig_msgid1, orig_msgid2, n
            )
        if n == 1:
            if t_values1 is not None:
                return msgid1.format(**t_values1)
            return msgid1
        if t_values2 is not None:
            return msgid2.format(**t_values2)
        return msgid2
    if t_values1 is None:
        return tmsg
    # A catalog plural index does not map one-to-one onto the two source
    # forms: index 0 may also cover counts other than 1, and its translation
    # may then use placeholders that only occur in msgid2.  Values of the
    # form matching the index take precedence; the other form only fills in
    # names that would otherwise be missing.
    if plural:
        t_values = t_values1 | t_values2
    else:
        t_values = t_values2 | t_values1
    return tmsg.format(**t_values)


Expand Down Expand Up @@ -655,3 +737,144 @@ def npgettext(context, msgid1, msgid2, n):
# gettext.

Catalog = translation


# utils for t-string handling in gettext translation + pygettext extraction
# TBD where they should go, and whether this should be a public API or internal,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should limit what is exposed in gettext apart from the core API.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that's why I underscored all of them for now.

FWIW, I think exposing at least the utils to convert a template string to a format string makes sense, because tools like Babel would need to use the exact same logic, or risk inconsistencies between implementations.

# especially the part about generating names from interpolations which is IMHO
# beneficial to have in stdlib so any implementation can re-use it without
# risking diverging behavior for the same expression between implementations

class _NameTooComplexError(ValueError):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

However, this should IMO be documented, since it is “public”. That said, I don’t like this approach; I think a general (new) gettext error (or, much simpler, a ValueError) would be clearer. Thoughts, Tomas?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Initially I just used this to avoid catching some other (unexpected) ValueError that may or may not come out of the ast visitor. A custom exception just for this may indeed be overkill.

But I like the idea of a GettextError :)

"""
Raised when an expression is too complex to derive a format string name
from it, or the resulting name would not be valid in a format string.
"""


class _ExtractNamesVisitor(ast.NodeVisitor):
def __init__(self):
    """Start with no collected name parts."""
    # Filled in visiting order by the visit_* methods.
    self._name_parts: list[str] = []

@property
def name(self) -> str:
name = '__'.join(self._name_parts)
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I used __ as a separator between parts since initially I thought that it might be nice to make it clearer that the placeholder for {user.name} isn't just something named user_name.

However, maybe using a single underscore would be fine here:

  • t'{user.name} {user_name}' would simply fail due to the check that a name doesn't map to different expressions
  • I can't come up with a good example where you would use foo.bar and foo_bar in the same string

if not name.isidentifier():
raise _NameTooComplexError(
'Only expressions which can be converted to a format string '
'placeholder may be used in a gettext call; assign the '
'expression to a variable and use that instead'
)
return name

def generic_visit(self, node):
    """Reject every node type that has no dedicated ``visit_*`` method.

    Raises ``_NameTooComplexError`` naming the offending node class.
    """
    node_type = type(node).__name__
    message = (
        f'Only simple expressions are supported, {node_type} is not allowed; '
        'assign the expression to a variable and use that instead'
    )
    raise _NameTooComplexError(message)

def visit_Attribute(self, node):
    """Collect the parts of the owner expression, then the attribute name."""
    owner, attribute = node.value, node.attr
    self.visit(owner)
    self._name_parts.append(attribute)

def visit_Name(self, node):
    """Collect a plain variable name as the next name part."""
    identifier = node.id
    self._name_parts.append(identifier)

def visit_Subscript(self, node):
    """Collect the parts of the subscripted expression, then its key.

    The subscripted expression is visited first.  Only a constant key is
    accepted; any other slice raises ``_NameTooComplexError``.
    """
    container, key = node.value, node.slice
    self.visit(container)
    if isinstance(key, ast.Constant):
        self.visit(key)
        return
    raise _NameTooComplexError(
        'Only constant value dict keys may be used in a gettext call; '
        'assign the expression to a variable and use that instead'
    )

def visit_Constant(self, node):
    """Collect the string form of a constant value as the next name part."""
    text = str(node.value)
    self._name_parts.append(text)

def visit_Call(self, node):
    """Collect the name parts of the called expression.

    Only calls without any arguments are accepted.  Positional arguments
    (including ``*args``) and keyword arguments (including ``**kwargs``)
    raise ``_NameTooComplexError``, because the derived name is built from
    the callee alone and would not reflect the arguments.
    """
    self.visit(node.func)
    # ``node.args`` holds positional and starred arguments, while
    # ``node.keywords`` holds ``name=value`` and ``**mapping`` arguments.
    if node.args or node.keywords:
        raise _NameTooComplexError(
            'Function calls with arguments are not supported in gettext '
            'calls; assign the result to a variable and use that instead'
        )


def _template_node_to_format(node: ast.TemplateStr) -> str:
    """Build a format string from the AST node of a template string.

    Constant parts are copied with their braces doubled; every
    interpolation becomes a ``{name}`` placeholder whose name is derived
    from the interpolated expression.

    Raises ``_NameTooComplexError`` if no name can be derived from an
    expression, or if two different expressions yield the same name.
    """
    pieces = []
    expr_by_name = {}
    for child in node.values:
        if isinstance(child, ast.Constant):
            # Literal braces must survive the later ``str.format`` call.
            escaped = child.value.replace('{', '{{').replace('}', '}}')
            pieces.append(escaped)
        elif isinstance(child, ast.Interpolation):
            extractor = _ExtractNamesVisitor()
            extractor.visit(child.value)
            name = extractor.name
            expr = ast.unparse(child.value)
            existing_expr = expr_by_name.get(name)
            if existing_expr and existing_expr != expr:
                raise _NameTooComplexError(
                    f'Interpolations of {existing_expr} and {expr} cannot '
                    'be mixed in the same gettext call; assign one of '
                    'them to a variable and use that instead'
                )
            expr_by_name[name] = expr
            pieces.append('{' + name + '}')
    return ''.join(pieces)


def _template_to_format(template: Template) -> tuple[str, dict[str, Any]]:
"""Convert a template to a format string and its value dict.
This takes a :class:`~string.templatelib.Template`, and converts all the
interpolations with format string placeholders derived from the original
expression.
This fails with a `_NameTooComplexError` in case the expression is not
suitable for conversion.
"""
parts = []
interpolation_format_names = {}
values = {}
for item in template:
match item:
case str() as s:
parts.append(s.replace('{', '{{').replace('}', '}}'))
case Interpolation(value, expr, conversion, format_spec):
value = convert(value, conversion)
value = format(value, format_spec)
name = _expr_to_format_field_name(expr)
if (
existing_expr := interpolation_format_names.get(name)
) and existing_expr != expr:
raise _NameTooComplexError(
f'Interpolations of {existing_expr} and {expr} cannot '
'be mixed in the same gettext call; assign one of '
'them to a variable and use that instead'
)
interpolation_format_names[name] = expr
values[name] = value
parts.append(f'{{{name}}}')
return ''.join(parts), values


@cache
def _expr_to_format_field_name(expr: str) -> str:
    """Derive a format string field name from the source text of an expression.

    Plain identifiers are returned unchanged and dotted identifier chains
    have their parts joined with ``__``.  Anything else is parsed and
    handed to `_ExtractNamesVisitor`.

    Raises `_NameTooComplexError` if the text is not a valid expression
    (possible for a manually constructed interpolation) or if the visitor
    rejects it.
    """
    # handle simple cases w/o the overhead of dealing with an ast
    if expr.isidentifier():
        return expr
    dotted_parts = expr.split('.')
    if all(part.isidentifier() for part in dotted_parts):
        return '__'.join(dotted_parts)
    try:
        expr_node = ast.parse(expr, mode='eval').body
    except SyntaxError as exc:
        # Keep the documented error contract instead of leaking SyntaxError.
        raise _NameTooComplexError(
            f'{expr!r} is not a valid expression and cannot be converted '
            'to a format string placeholder'
        ) from exc
    visitor = _ExtractNamesVisitor()
    visitor.visit(expr_node)
    return visitor.name
Loading
Loading