Steve Cotton 6f95b062cb wmlxgettext: Fix "fix invalid escape sequence"
The original code was probably meant to strip trailing whitespace, which
turned out to be a bug when 4be9aa85849010e93a9a3b0f0701d0630e7b9368
fixed the buggy regexp so that it started working. That has now been
removed.

Fixed handling of multiline plural strings, which was broken too.

One feature has been left unimplemented, and will cause wmlxgettext to error
out. For "long bracketed" strings, Lua allows the contents to start with a
newline, which is automatically stripped from the resulting string. Trying to
understand the original purpose led me to this feature of Lua strings, which we
don't use in Wesnoth; I finally concluded that, if someone wants to use
that feature, it can be their problem to implement it.

This fixes commit 4be9aa85849010e93a9a3b0f0701d0630e7b9368.

(cherry picked from commit e4239634e5094410478fa6b1d91df3be4ddf1caf)
2024-04-02 16:12:23 +02:00

476 lines
18 KiB
Python

import re
# import os
from pywmlx.wmlerr import wmlerr
from pywmlx.wmlerr import wmlwarn
from pywmlx.wmlerr import warnall
from pywmlx.postring import PoCommentedString
from pywmlx.postring import PoCommentedStringPL
from pywmlx.state.state import State
from pywmlx.state.lua_states import setup_luastates
from pywmlx.state.wml_states import setup_wmlstates
import pywmlx.nodemanip
import pdb
# Universe - a convenient singleton for which `x in Universe` is always True.
# Passing it to a filter is equivalent to not filtering at all.
class UniversalSet:
    """Container-like object that reports every value as a member."""

    def __contains__(self, any):
        # The looked-up value is deliberately ignored: membership never fails.
        return True


Universe = UniversalSet()
# --------------------------------------------------------------------
# PART 1: machine.py global variables
# --------------------------------------------------------------------
# True if the --warnall option is used
_warnall = False
# True if the -D option is used (setup() turns it on whenever a debug output
# file is supplied)
_debugmode = False
# debug output file (None when debugging is off)
_fdebug = None
# dictionary of pot sentences
_dictionary = None
# dictionary containing lua and WML states; addstate() creates it on first
# use and stores every state under its lower-cased name
_states = None
# initialdomain value (set with the --initialdomain command line option)
_initialdomain = None
# the current domain value while parsing a file (changed by #textdomain text)
_currentdomain = None
# the accepted domains (set with the --domain command line option).
# The default, Universe, accepts every domain.
_domains = Universe
# this boolean value will usually be:
#   True  (when the file is a WML .cfg file)
#   False (when the file is a .lua file)
_waitwml = True
# this boolean value is very useful to avoid a possible bug
# verified in a special case
# (see WmlGoluaState in wml_states.py for more details)
_on_luatag = False
# ---------
# pending additional infos for translators, collected from "# po"
# or "# po-override" comments.
_pending_cinfo = {
    # pending additional infos for translators (# po: addedinfo)
    "po": None,
    # pending override wmlinfo for translators (# po-override: overrideinfo)
    "po-override": None,
}
# type of the pending wmlinfo:
# it can be None or it can have an actual value.
# Possible actual values are: 'speaker', 'id', 'role', 'description',
# 'condition', 'type', 'race' or 'gender'
_pending_winfotype = None
# ----------
# the last function name encountered in lua code (if any).
# If no lua function has been encountered yet, this var will be None
_pending_luafuncname = None
# ----------
# pending lua/wml strings (they will be evaluated and, if translatable,
# added to _dictionary)
_pending_luastring = None
_pending_wmlstring = None
# ----------
# line number counter for the file being parsed
_current_lineno = 0
# _linenosub helps to set the right orderid of the future PoCommentedString
_linenosub = 0
# --------------------------------------------------------------------
# PART 2: machine.py functions and classes
# --------------------------------------------------------------------
def clear_pending_infos(lineno, error=False):
    """Reset every pending "# po" / "# po-override" info to None.

    lineno -- line number used in the file reference of any reported error
    error  -- when true, each info that is still pending is reported through
              wmlerr before being discarded
    """
    global _pending_cinfo
    for directive, pending in _pending_cinfo.items():
        if pending is not None and error:
            wmlerr(pywmlx.nodemanip.fileref + ":" + str(lineno),
                   "#%s directive(s) not applied: %s" % (directive, pending))
        # Only the value is replaced, so the dict keeps its size and can be
        # safely updated while it is being iterated.
        _pending_cinfo[directive] = None
def after_pending_info(lineno, error):
    """Discard the pending translator infos once a string has been handled.

    Thin wrapper around clear_pending_infos(); `error` says whether infos
    that are still pending must be reported.
    """
    clear_pending_infos(lineno, error=error)
def checkdomain(lineno):
    """Return True if the current textdomain is one of the accepted domains.

    When it is not, the pending translator infos are cleared (reporting the
    ones that were still pending) and False is returned.
    """
    global _currentdomain
    global _domains
    if _currentdomain not in _domains:
        clear_pending_infos(lineno, error=True)
        return False
    return True
def switchdomain(lineno, domain):
    """Make `domain` the current textdomain.

    Nothing happens when `domain` already is the current one; otherwise the
    pending translator infos are cleared (reporting the ones still pending)
    before the switch.
    """
    global _currentdomain
    domain_changed = _currentdomain != domain
    if not domain_changed:
        return
    clear_pending_infos(lineno, error=True)
    _currentdomain = domain
def checksentence(mystring, finfo, *, islua=False):
    """Sanity-check a translatable string and return a status code.

    Returns:
      1 -- the string is empty or whitespace-only (a warning is emitted)
      2 -- warnall() is on, the string does not come from lua and it contains
           a "}" (a warning about WML macros is emitted)
      0 -- nothing to report
    """
    if re.match(r'\s*$', mystring):
        wmlwarn(finfo, "found an empty translatable message")
        return 1
    # warnall() is consulted first, exactly once, for every non-empty string.
    if warnall() and not islua and "}" in mystring:
        wmsg = ("found a translatable string containing a WML macro. "
                " Translation for this string will NEVER work")
        wmlwarn(finfo, wmsg)
        return 2
    return 0
# When handling a PendingLuaString, if the string has a plural version then
# this class is used for the PendingLuaString.plural object.
class PendingPlural:
    """Plural form of a translatable lua string, collected line by line.

    Lines are gathered with addline(); convert() escapes the collected text
    and wraps it in a PoCommentedStringPL.
    """

    def __init__(self):
        # pluraltype values, used for both single-line and multiline strings
        #   0: initial value, should have been changed if a string was found
        #   1: delimited by double-quotes
        #   2: delimited by single-quotes
        #   3: delimited by long brackets, self.numequals is the level of
        #      brackets
        self.pluraltype = 0
        self.numequals = 0
        self.ismultiline = False
        # status values:
        #   'wait_string' --> right after "_ (", when the type of the string
        #                     to manage is not known yet
        #   'wait_plural' --> after the first argument. Search for the plural
        #                     or for the close parenthesis
        #   'wait_close'  --> expect the close parenthesis
        self.status = 'wait_string'
        self.string = ''

    def addline(self, value, isfirstline=False):
        """Store `value` as the first line, or append it as a new line."""
        if self.pluraltype == 3:
            if isfirstline and value == "":
                # This should be handled by not adding (self.string + '\n')
                # on the next call, but someone can implement that if they
                # start using long-bracket strings.
                raise NotImplementedError("Not implemented: handling of long-bracket strings that start with a newline.")
            # Backslashes in a long-bracket string are doubled here.
            value = value.replace('\\', r'\\')
        self.string = value if isfirstline else self.string + '\n' + value

    def convert(self):
        """Escape and quote the collected text; return a PoCommentedStringPL."""
        text = self.string
        kind = self.pluraltype
        if kind == 3:
            # long brackets: every double quote is escaped
            text = text.replace('"', r'\"')
        elif kind != 0:
            if kind == 2:
                # single-quoted: \' becomes a plain '
                text = re.sub(r"\\\'", r"'", text)
            # escape the double quotes that are not escaped yet
            text = re.sub(r'(?<!\\)"', r'\"', text)
        if self.ismultiline:
            # every line break becomes a literal \n that closes the current
            # quoted line and opens the next one
            lf = r'\\n"' + '\n"'
            text = re.sub(r'(\n\r|\r\n|[\n\r])', lf, text)
            text = '""\n"' + text + '"'
        else:
            text = '"' + text + '"'
        self.string = text
        return PoCommentedStringPL(self.string, ismultiline=self.ismultiline)
class PendingLuaString:
    """A lua string being collected, stored in _dictionary if translatable.

    luatype values handled specially by this class:
      'luastr2'    --> a \\' sequence is turned into a plain ' when storing
      'luastr3'    --> long-bracket string: backslashes are doubled while
                       collecting, double quotes are escaped when storing
      'lua_plural' --> the constructor does not store `luastring`
    """

    def __init__(self, lineno, luatype, luastring, ismultiline,
                 istranslatable, numequals=0, plural=None):
        self.lineno = lineno
        self.luatype = luatype
        self.ismultiline = ismultiline
        self.istranslatable = istranslatable
        self.numequals = numequals
        self.plural = plural
        self.luastring = ''
        if luatype != 'lua_plural':
            self.addline(luastring, True)

    def addline(self, value, isfirstline=False):
        """Store `value` as the first line, or append it as a new line."""
        if self.luatype == 'luastr3':
            if isfirstline and value == "":
                # This should be handled by not adding (self.string + '\n')
                # on the next call, but someone can implement that if they
                # start using long-bracket strings.
                raise NotImplementedError("Not implemented: handling of long-bracket strings that start with a newline.")
            value = value.replace('\\', r'\\')
        self.luastring = (value if isfirstline
                          else self.luastring + '\n' + value)

    # this function is used by store, when translating lua pending plural into
    # PoCommentedString.plural
    def storePlural(self):
        """Return the converted plural form, or None when there is none."""
        return None if self.plural is None else self.plural.convert()

    def store(self):
        """Add the string to _dictionary (when translatable and not empty).

        Nothing is done when the current domain is not accepted. Otherwise
        the pending translator infos are always cleared at the end, and they
        are reported as not applied when the string is not translatable.
        """
        global _pending_cinfo
        global _linenosub
        if not checkdomain(self.lineno):
            return
        if self.istranslatable:
            _linenosub += 1
            finfo = pywmlx.nodemanip.fileref + ":" + str(self.lineno)
            fileno = pywmlx.nodemanip.fileno
            # checksentence returns 1 for an empty string: such a string is
            # never added to the dictionary.
            if checksentence(self.luastring, finfo, islua=True) != 1:
                if self.luatype == 'luastr3':
                    self.luastring = self.luastring.replace('"', r'\"')
                else:
                    if self.luatype == 'luastr2':
                        self.luastring = re.sub(r"\\\'", r"'",
                                                self.luastring)
                    self.luastring = re.sub(r'(?<!\\)"', r'\"',
                                            self.luastring)
                # A "# po-override" info replaces the default
                # "[lua]: function name" info.
                override = _pending_cinfo["po-override"]
                if override is not None:
                    wmlinfos = [override]
                elif _pending_luafuncname is not None:
                    wmlinfos = ['[lua]: ' + _pending_luafuncname]
                else:
                    wmlinfos = []
                addedinfos = _pending_cinfo["po"]
                if addedinfos is None:
                    addedinfos = []
                if _currentdomain not in _dictionary:
                    _dictionary[_currentdomain] = {}
                domain_sentences = _dictionary[_currentdomain]
                known_sentence = domain_sentences.get(self.luastring)
                new_sentence = PoCommentedString(
                    self.luastring,
                    _currentdomain,
                    orderid=(fileno, self.lineno, _linenosub),
                    ismultiline=self.ismultiline,
                    wmlinfos=wmlinfos, finfos=[finfo],
                    addedinfos=addedinfos,
                    plural=self.storePlural())
                if known_sentence is None:
                    domain_sentences[self.luastring] = new_sentence
                else:
                    known_sentence.update_with_commented_string(new_sentence)
        # finally PendingLuaString.store() will clear pendinginfos
        # in any case (even if the pending string is not translatable)
        after_pending_info(self.lineno, not self.istranslatable)
class PendingWmlString:
    """A WML string being collected, handed to pywmlx.nodemanip when stored.

    Backslashes are doubled as soon as text is collected.
    """

    def __init__(self, lineno, wmlstring, ismultiline, istranslatable, israw):
        """The israw argument indicates a << >> delimited string"""
        escaped = wmlstring.replace('\\', r'\\')
        self.lineno = lineno
        self.wmlstring = escaped
        self.ismultiline = ismultiline
        self.istranslatable = istranslatable
        self.israw = israw

    def addline(self, value):
        """Append `value` (backslashes doubled) as a new line."""
        escaped = value.replace('\\', r'\\')
        self.wmlstring = self.wmlstring + '\n' + escaped

    def store(self):
        """Record the string as a wmlinfo and/or a translatable sentence.

        A pending wmlinfo type is always consumed; it produces a
        "type=value" wmlinfo only for a single-line, non-translatable string.
        Nothing else is done when the current domain is not accepted.
        Otherwise a translatable, non-empty string is added to the current
        node, and the pending translator infos are cleared (reported as not
        applied when the string is not translatable).
        """
        global _linenosub
        global _pending_cinfo
        global _pending_winfotype
        if _pending_winfotype is not None:
            is_plain_value = (self.ismultiline is False
                              and self.istranslatable is False)
            if is_plain_value:
                pywmlx.nodemanip.addWmlInfo(
                    _pending_winfotype + '=' + self.wmlstring)
            _pending_winfotype = None
        if not checkdomain(self.lineno):
            return
        if self.istranslatable:
            finfo = pywmlx.nodemanip.fileref + ":" + str(self.lineno)
            # checksentence returns 1 for an empty string: such a string is
            # never added.
            if checksentence(self.wmlstring, finfo, islua=False) != 1:
                _linenosub += 1
                # raw strings escape every double quote, the other strings
                # turn each doubled double quote into an escaped one
                quote_pattern = '"' if self.israw else '""'
                self.wmlstring = re.sub(quote_pattern, r'\"', self.wmlstring)
                pywmlx.nodemanip.addNodeSentence(
                    self.wmlstring,
                    domain=_currentdomain,
                    ismultiline=self.ismultiline,
                    lineno=self.lineno,
                    lineno_sub=_linenosub,
                    override=_pending_cinfo["po-override"],
                    addition=_pending_cinfo["po"])
        after_pending_info(self.lineno, not self.istranslatable)
def addstate(name, value):
    """Register the state `value` under the lower-cased `name`.

    The state dictionary is created on the first registration.
    """
    global _states
    _states = {} if _states is None else _states
    key = name.lower()
    _states[key] = value
def setup(dictionary, initialdomain, domains, wall, fdebug):
    """Store the run-wide settings and register the lua and WML states.

    dictionary    -- dictionary that will receive the collected sentences
    initialdomain -- domain every file starts with
    domains       -- iterable of accepted domains; None keeps the current
                     value of _domains
    wall          -- value stored into _warnall
    fdebug        -- debug output file; debug mode is on unless it is None
    """
    global _dictionary
    global _initialdomain
    global _domains
    global _warnall
    global _debugmode
    global _fdebug
    _dictionary, _initialdomain = dictionary, initialdomain
    if domains is not None:
        _domains = set(domains)
    _warnall, _fdebug = wall, fdebug
    _debugmode = fdebug is not None
    setup_luastates()
    setup_wmlstates()
def run(*, filebuf, fileref, fileno, startstate, waitwml=True):
    """Run the state machine over every line of one file.

    filebuf    -- iterable yielding the lines of the file
    fileref    -- file reference passed to pywmlx.nodemanip.newfile
    fileno     -- file number passed to pywmlx.nodemanip.newfile
    startstate -- name of the state the machine starts in
    waitwml    -- value stored into _waitwml

    Each line is offered to the current state. A state without a regex, or
    whose regex matches, is run: it returns what is left of the line and the
    name of the next state. Otherwise the machine moves to the state named by
    `iffail`. A line is finished when a state returns None as the remaining
    line. An undecodable byte met while reading `filebuf` is reported through
    wmlerr.
    """
    global _current_lineno
    global _linenosub
    global _waitwml
    global _currentdomain
    global _pending_luafuncname
    global _on_luatag
    _pending_luafuncname = None
    _on_luatag = False
    state = _states.get(startstate)
    state_name = startstate
    _current_lineno = 0
    _linenosub = 0
    _waitwml = waitwml
    _currentdomain = _initialdomain
    pywmlx.nodemanip.newfile(fileref, fileno)
    try:
        for xline in filebuf:
            xline = xline.strip('\n\r')
            _current_lineno += 1
            # on a new line, the debug file gets another marker
            if _debugmode:
                print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@',
                      file=_fdebug)
            while xline is not None:
                # print debug infos (if debugmode is on)
                if _debugmode:
                    print('---------------------------------------------------',
                          file=_fdebug)
                    print('LINE', '%05d' % _current_lineno, '|', xline,
                          file=_fdebug)
                if state.regex is None:
                    # a state without a regex always runs
                    match = None
                    must_run = True
                    if _debugmode:
                        print('ALWAYS-RUN x', state_name, file=_fdebug)
                else:
                    match = re.match(state.regex, xline)
                    must_run = bool(match)
                    if _debugmode:
                        if must_run:
                            print('RUN state \\', state_name, file=_fdebug)
                        else:
                            print('FAIL state |', state_name, file=_fdebug)
                if must_run:
                    # the state returns the new value of xline and the name
                    # of the next state
                    xline, next_name = state.run(xline, _current_lineno,
                                                 match)
                else:
                    # no match: fall back to the state named by iffail
                    next_name = state.iffail
                state_name = next_name
                state = _states.get(next_name)
    except UnicodeDecodeError as e:
        if "test_cve_2018_1999023_2.cfg" in pywmlx.nodemanip.fileref:
            # This unit test is allowed to contain invalid UTF-8. Ignore it.
            return
        # position, in the decoded buffer, of the byte which causes the error
        bad_pos = int(e.start)
        bad_byte = hex(e.object[bad_pos])
        # The line of the error is recomputed from the bytes that precede the
        # invalid one: each byte is mapped to one character and the text is
        # split into lines. The last item is the text found on the offending
        # line before the invalid byte.
        preceding_text = "".join(chr(byte) for byte in e.object[0:bad_pos])
        preceding_lines = preceding_text.split('\n')
        errlineno = len(preceding_lines)
        finfo = pywmlx.nodemanip.fileref + ":" + str(errlineno)
        errmsg = "".join([
            "UTF-8 Format error.\nCan't decode byte ", bad_byte, ' (',
            e.reason, ').\n',
            'Probably your file is not encoded with UTF-8 encoding: you ',
            'should open the file with an advanced text editor, and re-save ',
            'it with UTF-8 encoding.\n',
            'To avoid this problem in the future, you might want to set ',
            'the default encoding of your editor to UTF-8.\n\n',
            'Text preceding the invalid byte (source file, line ',
            str(errlineno), '):\n', preceding_lines[-1], '\n',
        ])
        wmlerr(finfo, errmsg)
    pywmlx.nodemanip.closefile(_dictionary, _current_lineno)