wesnoth/data/tools/wesnoth/wmlparser.py
Michał Janiszewski 9d333bc95a Compare to None using identity is operator
This is a trivial change that replaces `==` operator with `is` operator, following PEP 8 guideline:

> Comparisons to singletons like None should always be done with is or is not, never the equality operators.

https://legacy.python.org/dev/peps/pep-0008/#programming-recommendations
2018-10-31 12:21:01 +11:00

932 lines
34 KiB
Python
Executable File

#!/usr/bin/env python2
# encoding: utf-8
import wmldata, os, glob, sys
import re
"""
NOTE: You should use wmlparser2.py instead which uses the C++
preprocessor.
Module implementing a WML parser in pure python.
"""
class Error(Exception):
    """
    Exception raised by the parser when it hits invalid input.

    The message starts with the position of the error itself and is followed
    by one "from file:line" entry for every text that was suspended when the
    error happened, innermost first, so the chain of files and macros leading
    to the error can be read off it.
    """

    def __init__(self, parser, text):
        # First entry: where the error itself was detected.
        report = ["%s:%d: %s" % (parser.filename, parser.line, text)]
        # Then the stack of suspended texts, most recent one first, each
        # level indented one column further than the previous one.
        for depth, outer in enumerate(reversed(parser.texts)):
            report.append("\n " + " " * depth +
                          "from %s:%d" % (outer.filename, outer.line))
        self.text = "".join(report)

    def __str__(self):
        return self.text
class Parser:
"""
The main parser class. An instance of this is needed for parsing.
Input is queued with parse_file, parse_stream or parse_text, which all
push a new text onto the parser, and is then consumed by parse_top.
"""
class Macro:
    """Plain record describing one macro definition."""

    def __init__(self, name, params, text, textdomain):
        # name: macro name; params: its parameter names; text: the
        # replacement body; textdomain: the textdomain active at definition.
        self.name = name
        self.params = params
        self.text = text
        self.textdomain = textdomain
class TextState:
    """Snapshot of the parser position, stored while another text is parsed."""

    def __init__(self, filename, text, textpos, line, current_path,
            textdomain):
        # What was being read ...
        self.filename = filename
        self.text = text
        # ... and how far reading had progressed.
        self.textpos = textpos
        self.line = line
        # Context that has to be restored along with the position.
        self.current_path = current_path
        self.textdomain = textdomain
def __init__(self, data_dir, user_dir=None, no_macros_in_string=False):
    """
    Create a parser with no input queued yet.

    data_dir is used for resolving {filepath} and {@filepath}
    user_dir is used for resolving {~filepath} and {@filepath}
    no_macros_in_string makes parse_string treat "{" inside a quoted
    string as an ordinary character.

    See https://www.wesnoth.org/wiki/PreprocessorRef
    """
    # Configuration handed in by the caller.
    self.data_dir = data_dir
    self.user_dir = user_dir
    self.no_macros_in_string = no_macros_in_string

    # Text currently being read and the position within it.
    self.filename = ""
    self.text = ""
    self.textpos = 0
    self.line = 1
    self.current_path = "."
    self.textdomain = ""
    # Stack of suspended texts (TextState objects) to return to.
    self.texts = []

    # Known macros, keyed by name.
    self.macros = {}
    self.no_macros = False
    # Callback which is called for each macro. If it returns False, the
    # macro is ignored.
    self.macro_callback = None
    self.macro_not_found_callback = None

    # If set to a function, the function is called with the current
    # textdomain and string as arguments for any translatable string, and
    # is expected to return a translation.
    self.gettext = None

    # A list containing the stacked up #ifdefs.
    self.preprocessor_nesting = []
    self.stay_in_file = False

    # If set, included files are only parsed when under the given directory.
    self.only_expand_pathes = []

    # Whether to print current file, comments, and macro replacements.
    self.verbose = False
def read_encoded(self, filename):
    """
    Read a file and return its contents as normalized unicode text.

    The bytes are decoded as UTF-8, falling back to Latin-1 when they are
    not valid UTF-8. Line endings are converted to "\\n", every tab becomes
    a single space, and the result always ends with a newline.

    If the file cannot be opened a message is written to stderr and an
    empty string is returned.
    """
    try:
        # Binary mode: decoding and newline conversion are done explicitly
        # below. The with-block closes the handle even if read() fails.
        with open(filename, "rb") as handle:
            raw = handle.read()
    except IOError:
        sys.stderr.write("Cannot open file %s!\n" % filename)
        return ""
    try:
        text = raw.decode("utf8")
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this cannot fail.
        text = raw.decode("latin1")
    text = text.replace("\r\n", "\n").replace("\t", " ").replace("\r", "\n")
    if not text.endswith("\n"):
        text += "\n"
    return text
def set_macro_not_found_callback(self, callback):
    """
    Install a last-resort handler for macros the parser cannot resolve.

    The handler is invoked as
        callback(wmlparser, name, params)
    """
    self.macro_not_found_callback = callback
def parse_file(self, filename):
    """
    Queue the contents of the given file for parsing.

    The directory containing the file becomes the current path while the
    file is being parsed.
    """
    contents = self.read_encoded(filename)
    directory = os.path.dirname(filename)
    self.push_text(filename, contents, cd = directory)
def parse_stream(self, stream, binary = False):
    """
    Queue everything readable from a file object for parsing.

    Unless binary is set, line endings are converted to "\\n" and tabs to
    single spaces first.
    """
    text = stream.read()
    if not binary:
        for old, new in (("\r\n", "\n"), ("\t", " "), ("\r", "\n")):
            text = text.replace(old, new)
    self.push_text("inline", text)
def parse_text(self, text, binary = False):
    """
    Queue the given string for parsing.

    Unless binary is set, the text is normalized the same way as in
    read_encoded and parse_stream: "\\r\\n" and lone "\\r" become "\\n" and
    tabs become single spaces. The lone "\\r" case used to be skipped here,
    leaving such line endings in the text that the line-based readers
    (which split on "\\n" only) would not recognise.
    """
    if not binary:
        text = text.replace("\r\n", "\n").replace("\t", " ").replace("\r", "\n")
    self.push_text("inline", text)
def push_text(self, filename, text, params = None, cd = None,
        initial_textdomain = ""):
    """
    Suspend the current text and start reading a new one, e.g. when a file
    is included or a macro is expanded.

    The current position is saved on the texts stack so pop_text can
    return to it. An empty text is replaced by a single newline. When cd
    is given it becomes the current path for the new text.
    """
    if self.verbose:
        sys.stderr.write("%s:%d: Now parsing %s.\n" % (self.filename,
            self.line, filename))
    # Remember where we were before switching over.
    suspended = self.TextState(self.filename, self.text, self.textpos,
        self.line, self.current_path, self.textdomain)
    self.texts.append(suspended)
    # Start at the beginning of the new text.
    self.filename = filename
    self.text = text or "\n"
    self.params = params
    self.textpos = 0
    self.line = 1
    self.textdomain = initial_textdomain
    if cd:
        self.current_path = cd
def pop_text(self):
    """
    Drop the current text and resume the one that was suspended last.
    """
    resumed = self.texts.pop()
    self.filename = resumed.filename
    self.text = resumed.text
    self.textpos = resumed.textpos
    self.line = resumed.line
    self.current_path = resumed.current_path
    self.textdomain = resumed.textdomain
    if self.verbose:
        sys.stderr.write("%s:%d: Back.\n" % (self.filename, self.line))
def get_macros(self):
    """
    Return the macros currently known to the parser.

    The result is the parser's own dictionary (macro name -> macro), not
    a copy.
    """
    known = self.macros
    return known
def add_macros(self, macros):
    """
    Merge the given name -> macro mapping into the parser's macros.

    Entries with a name that is already known replace the old macro.
    """
    self.macros.update((name, macros[name]) for name in macros)
def read_next(self):
    """
    Consume and return the next character of the current text.

    Newlines advance the line counter. When the character was the last one
    of the current text and a suspended text is waiting, the parser
    switches back to it, unless stay_in_file is set.
    """
    position = self.textpos
    char = self.text[position]
    self.textpos = position + 1
    if char == "\n":
        self.line += 1
    exhausted = self.textpos == len(self.text)
    if exhausted and len(self.texts) and not self.stay_in_file:
        self.pop_text()
    return char
def at_end(self):
    """
    Tell whether all input has been consumed.

    That is the case when the current text has been read completely and
    either no suspended text is left to return to or stay_in_file keeps
    the parser inside the current text.
    """
    if len(self.texts) != 0 and not self.stay_in_file:
        return False
    return self.textpos == len(self.text)
def check_end(self):
    """
    Return to the suspended text if the current one has been read
    completely. Unlike read_next this ignores stay_in_file.
    """
    if self.textpos == len(self.text) and len(self.texts):
        self.pop_text()
def peek_next(self):
    """
    Return the character read_next would deliver, without consuming it.

    When the current text is used up the next character of the most
    recently suspended text is reported instead, or "" if that text is
    used up as well.
    """
    if self.textpos < len(self.text):
        return self.text[self.textpos]
    if len(self.texts) and not self.stay_in_file:
        outer = self.texts[-1]
        if outer.textpos >= len(outer.text):
            return ""
        return outer.text[outer.textpos]
    # Nothing left and nowhere to look: indexing past the end raises
    # IndexError, exactly as a plain lookup would.
    return self.text[self.textpos]
def read_until(self, sep):
    """
    Read up to and including the first character that occurs in sep.

    Returns the text read, with the separator as its last character. If
    the current text ends before a separator is found, reading continues
    in the suspended text (unless stay_in_file is set or nothing is
    suspended), in which case the result may not end with a separator.
    Line numbers are kept up to date.
    """
    # sep is a set of literal characters. Escape it so characters that are
    # special inside a regex character class ("^", "-", "]", "\\") are
    # matched literally instead of breaking or changing the pattern.
    pattern = re.compile(".*?[" + re.escape(sep) + "]", re.S)
    mob = pattern.match(self.text, self.textpos)
    if mob:
        found = mob.group(0)
        self.line += found.count("\n")
        self.textpos = mob.end(0)
        if self.textpos == len(self.text) and not self.stay_in_file:
            if len(self.texts): self.pop_text()
        return found
    # No separator in the rest of this text: take all of it ...
    found = self.text[self.textpos:]
    self.line += found.count("\n")
    self.textpos = len(self.text)
    # ... and carry on in the text we were included from, if any.
    if len(self.texts) and not self.stay_in_file:
        self.pop_text()
        found += self.read_until(sep)
    return found
def read_while(self, sep):
    """
    Consume characters for as long as they occur in sep and return them.
    """
    consumed = []
    while not self.at_end():
        if self.peek_next() not in sep:
            break
        consumed.append(self.read_next())
    return "".join(consumed)
def skip_whitespace_and_newlines(self):
    """Skip any run of spaces, tabs, carriage returns and newlines."""
    blanks = " \t\r\n"
    self.read_while(blanks)
# Directives that read_lines_until and skip_whitespace_inside_statement must
# not mistake for ordinary "#" comments.
preprocessor_commands = ["#define", "#undef", "#textdomain", "#ifdef",
"#ifndef", "#else", "#enddef", "#endif"]
def read_lines_until(self, string):
    """
    Collect whole lines up to the first occurrence of string outside of a
    quoted string, dropping comment lines on the way.

    Returns the collected text (excluding string and whatever follows it
    on its line), or None if the input ends first. Lines starting with a
    preprocessor command are kept, every other "#" line is a comment.
    """
    collected = ""
    inside_quotes = False
    while True:
        if self.at_end():
            return None
        line = self.read_until("\n")
        scan_from = 0
        if inside_quotes:
            # A string opened on an earlier line: look for its end.
            closing = line.find('"')
            if closing < 0:
                collected += line
                continue
            inside_quotes = False
            scan_from = closing + 1
        else:
            stripped = line.lstrip()
            if (stripped.startswith("#") and not
                    stripped.startswith(tuple(self.preprocessor_commands))):
                continue
        # Step over every complete "..." pair on the line; an unpaired
        # quote means the string continues on the next line.
        opening = line.find('"', scan_from)
        while opening >= 0:
            closing = line.find('"', opening + 1)
            if closing < 0:
                inside_quotes = True
                break
            scan_from = closing + 1
            opening = line.find('"', scan_from)
        if not inside_quotes:
            end = line.find(string, scan_from)
            if end >= 0:
                return collected + line[:end]
        collected += line
def read_lua(self):
    """
    Read the body of a << ... >> literal and return it unchanged.

    The opening << must already have been consumed; the closing >> is
    consumed but not returned. Raises Error if the input ends first.
    """
    pieces = []
    while not self.at_end():
        char = self.read_next()
        if char == ">" and self.peek_next() == ">":
            self.read_next() #get rid of trailing >
            return "".join(pieces)
        pieces.append(char)
    raise Error(self, "Unexpected end of file")
def skip_whitespace_inside_statement(self):
    """
    Skip whitespace, newlines and comment lines in the middle of a
    statement, stopping in front of the next real character or in front
    of a preprocessor command.
    """
    while True:
        self.read_while(" \t\r\n")
        if self.at_end():
            return
        if self.peek_next() != "#":
            return
        # A "#" is either a preprocessor command, which is left for the
        # caller, or a comment, which is dropped up to the end of line.
        for command in self.preprocessor_commands:
            if self.check_for(command):
                return
        self.read_until("\n")
def skip_whitespace(self):
    """Skip any run of spaces, staying on the current line."""
    only_spaces = " "
    self.read_while(only_spaces)
def check_for(self, str):
    """
    Tell whether the current text continues with str at the read position.
    Nothing is consumed.
    """
    start = self.textpos
    stop = start + len(str)
    upcoming = self.text[start:stop]
    return upcoming == str
def parse_macro(self):
"""Handle a {...} expression whose opening brace was already consumed.

The text up to the matching closing brace is first tried as a file or
directory inclusion (prefixes ~, @ and . select where to look) and
otherwise treated as a macro call.

Returns None when an inclusion or expansion was pushed onto the parser
(or when nothing is to be done), a wmldata.DataMacro when no_macros is
set, and otherwise - for an unknown macro - the result of
macro_not_found_callback if that is truthy, else the macro name.
Raises Error on an unclosed macro, on unexpected end of input and when
too few arguments are given.

No recursive macro processing is done here. If a macro is passed as
parameter to a macro, then whoever parses the macro replacement will do
the macro recursion.
Actually - I'm too tired right now to think this through. Maybe it
should be done the other way around, starting expansion with the
innermost macro?
"""
params = []
# The caller has consumed one "{" already, so start one level deep.
balance = 1
macro = ""
while balance > 0:
macro += self.read_until("{}")
if macro[-1] == "{":
balance += 1
elif macro[-1] == "}":
balance -= 1
else:
raise Error(self, "Unclosed macro")
# NOTE(review): unreachable, the raise above always leaves the function.
return
# Keep the full text including the closing brace for re-parsing below.
preserve = macro
macro = macro[:-1] # Get rid of final }
if self.macro_callback:
if not self.macro_callback(macro): return None
# NOTE(review): macro[0] raises IndexError for an empty "{}".
# If the macro starts with ~, assume a file in userdata.
if macro[0] == "~":
if self.user_dir:
dirpath = self.user_dir + "/" + macro[1:]
else:
dirpath = None
# If the macro starts with @, look first in data then in userdata.
elif macro[0] == "@":
if self.data_dir:
dirpath = self.data_dir + "/" + macro[1:]
if not os.path.exists(dirpath): dirpath = None
else:
dirpath = None
if not dirpath and self.user_dir:
dirpath = self.user_dir + "/" + macro[1:]
# If the macro starts with ., look relative to the currently parsed
# file.
elif macro[0] == ".":
dirpath = self.current_path + macro[1:]
# Otherwise, try to interpret the macro as a filename in the data dir.
elif self.data_dir is not None:
dirpath = self.data_dir + "/" + macro
else:
dirpath = None
if dirpath is not None and os.path.exists(dirpath):
dirpath = os.path.normpath(dirpath)
if self.only_expand_pathes:
# NOTE(review): os.path.commonprefix compares character by character,
# so "/a/bc" counts as being under "/a/b" here.
if not [x for x in self.only_expand_pathes if os.path.commonprefix([dirpath, x]) == x]:
return None
# If it is a directory, parse all cfg files within.
if os.path.isdir(dirpath):
# Note: glob.glob will try to return unicode filenames
# if you pass it an unicode string - but to deal with
# non-unicode filenames as are allowed in linux we
# convert to a byte-string.
# Execute all WML files in the directory.
files = glob.glob(str(dirpath + "/*.cfg"))
# And also execute directories with a _main.cfg.
files += glob.glob(str(dirpath + "/*/_main.cfg"))
files.sort()
mc = dirpath + "/_main.cfg"
fc = dirpath + "/_final.cfg"
if mc in files:
# If there's a file called _main.cfg, only parse that.
files = [mc]
elif fc in files:
# If there's a file called _final.cfg, parse it only after
# all others.
files.remove(fc)
files.append(fc)
else:
files = [dirpath]
# Texts are pushed onto a stack, so push in reverse to have the first
# file end up on top and be parsed first.
files.reverse()
for path in files:
self.push_text(path, self.read_encoded(path), cd = os.path.dirname(path))
return None
# It's OK for user directories not to exist.
# Nothing prefixed with ~ can be a macro.
if macro.startswith("~") and not self.verbose:
return None
# No file was found, try to do macro expansion. First, push the
# macro call again. E.g. {blah 1 2 3}.
self.push_text("macro", preserve)
# Find all parameters.
while 1:
read = self.read_until('"}{ (\n')
if not read:
sys.stderr.write("? %s\n" % macro)
sys.stderr.write(" (%s)\n" % params)
raise Error(self, "Unexpected end of file")
# NOTE(review): unreachable, see the raise above.
break
# The last character tells which delimiter ended this chunk.
sep = read[-1]
read = read[:-1]
if sep == "}":
if read: params += [read]
break
elif sep == "{":
# A nested {...} is passed on verbatim as a single argument.
balance = 1
param = sep + read
while balance:
c = self.read_next()
if c == "{":
balance += 1
elif c == "}":
balance -= 1
param += c
params += [param]
elif sep == '"':
# Cannot parse strings here.. it must be passed to the macro as
# is, so the real string parser can handle it. Else there will
# be subtle bugs, e.g. when a MACRO evaluates to a + at the end
# of line.
read += '"' + self.read_until('"')
params += [read]
elif sep == "(":
# A parenthesized argument; the outer parentheses are dropped.
in_string = False
balance = 1
param = read
while balance:
c = self.read_next()
# Ignore () within strings
if c == '"':
in_string = not in_string
if not in_string:
if c == "(":
balance += 1
elif c == ")":
balance -= 1
param += c
params += [param[:-1]]
else:
if read:
params += [read]
self.read_while(" \n")
if self.no_macros:
return wmldata.DataMacro("macro", " ".join(params))
# The first word of the call is the macro name, the rest are arguments.
name = params[0]
if name in self.macros:
macro = self.macros[name]
text = macro.text
for i, j in enumerate(macro.params):
if 1 + i >= len(params):
raise Error(self, "Not enough parameters for macro %s. " % name +
"%d given but %d needed %s." % (len(params) - 1,
len(macro.params), macro.params))
rep = params[1 + i]
# Handle gettext replacement here, since inside the macro
# the textdomain will be wrong.
if self.gettext and rep and rep[0] == "_":
q = rep.find('"')
qe = rep.find('"', q + 1)
rep = self.gettext(self.textdomain, rep[q + 1:qe])
rep = '"' + rep + '"'
if self.verbose:
#s = "Replacing {%s} with %s\n" % (j, rep)
##sys.stderr.write(s.encode("utf8"))
pass
# Plain textual substitution of {PARAM} by the argument.
text = text.replace("{%s}" % j, rep)
if text:
self.push_text(name, text, initial_textdomain = macro.textdomain)
else:
pass # empty macro, nothing to do
else:
if self.macro_not_found_callback:
keep_macro = self.macro_not_found_callback(self, name, params)
if keep_macro: return keep_macro
if self.verbose:
sys.stderr.write("No macro %s.\n" % name.encode("utf8"))
if self.verbose:
sys.stderr.write(" (%s:%d)\n" % (self.filename, self.line))
return name
return None
def parse_string(self):
    """
    Read a quoted string whose opening quote was already consumed and
    return its contents without the closing quote.

    A doubled quote inside the string stands for one literal quote. Unless
    no_macros_in_string is set, {...} inside the string is handed to
    parse_macro; a name it returns is inserted as text. Raises Error if
    the input ends before the string is closed.
    """
    terminators = '"' if self.no_macros_in_string else '"{'
    text = ""
    while not self.at_end():
        text += self.read_until(terminators)
        last = text[-1]
        if last == '"':
            if self.peek_next() != '"':
                return text[:-1]
            # Doubled quote: drop the second one and keep reading.
            self.read_next()
        elif last == '{':
            text = text[:-1]
            expansion = self.parse_macro()
            if not isinstance(expansion, wmldata.Data) and expansion:
                text += expansion
        else:
            break
    raise Error(self, "Unclosed string")
def parse_inside(self, data, c):
# Parse one "key=value" statement (or "k1,k2=v1,v2") starting with the
# already consumed character c, and return a list with one
# wmldata.DataText per key. Raises Error when no "=" is found or when
# there are more keys than values.
# NOTE(review): the data argument is never read; the name is rebound to a
# fresh list further down.
variables = []
values = []
variable = ""
value = ""
got_assign = False
# Spaces seen inside a value are held back here and only added when more
# text follows, which trims trailing spaces.
spaces = ""
filename = "(None)"
line = -1
got_lua = False
while 1:
if c == "{":
keep_macro = self.parse_macro()
if keep_macro:
if self.no_macros:
values += [keep_macro.name]
else:
values += [keep_macro]
elif c == "\n":
break
elif c == "#":
# FIXME
# Assume something like:
# name="picture.png="+
# #textdomain blah
# "translateable name"
#
# For now, we ignore the textdomain...
self.read_until("\n")
break
elif c == "+":
value = value.rstrip() # remove whitespace before +
self.skip_whitespace_inside_statement() # read over newline
elif not got_assign:
# Still on the left-hand side: collect key names.
if c == "=":
variables += [variable.rstrip()]
got_assign = True
translatable = False
# Remember where the assignment was seen, for set_meta below.
filename = self.filename
line = self.line
self.skip_whitespace()
else:
if c == ",":
variables += [variable]
variable = ""
else:
variable += c
else:
# Right-hand side: collect the value(s).
if c == "<" and self.peek_next() == "<":
self.read_next() #skip the rest of the opening
value += self.read_lua()
elif c == '"':
# We want the textdomain at the beginning of the string,
# the end of the string may be outside a macro and already
# in another textdomain.
textdomain = self.textdomain
# remove possible _
i = len(value)
while i > 0:
i -= 1
if value[i] != " ": break
got_underscore = False
if value and value[i] == "_":
got_underscore = True
translatable = True
# remove whitespace before _
while i > 1:
if value[i - 1] != " ": break
i -= 1
value = value[:i]
string = self.parse_string()
if got_underscore:
if self.gettext:
string = self.gettext(textdomain, string)
value += string
spaces = ""
else:
# A comma only separates values while there are keys left to fill.
if c == "," and len(values) + 1 < len(variables):
values += [value]
value = ""
spaces = ""
elif c == " ":
spaces += c
else:
if spaces:
value += spaces
spaces = ""
value += c
if self.at_end(): break
c = self.read_next()
if not got_assign:
raise Error(self, "= expected for \"%s\"" % variable)
# NOTE(review): unreachable, the raise above always leaves the function.
return []
values += [value]
data = []
for i in xrange(len(variables)):
try:
key = wmldata.DataText(variables[i], values[i], translatable)
key.set_meta(filename, line)
data.append(key)
except IndexError:
raise Error(self, "Assignement does not match: %s = %s" % (
str(variables), str(values)))
return data
def parse_top(self, data, state = None):
# Parse tags, attributes, macros and preprocessor directives and insert
# the results into data. state is the name of the tag currently open (None
# at top level); the function returns when its closing tag is read or
# when the input is exhausted. Raises Error on malformed input.
while 1:
self.skip_whitespace_and_newlines()
if self.at_end():
if state:
raise Error(self, "Tag stack non-empty (%s) at end of data" % state)
break
c = self.read_next()
if c == "#": # comment or preprocessor
if self.check_for("define "):
self.read_until(" ")
# First word on the line is the macro name, the rest its parameters.
params = []
while 1:
name = self.read_until(" \n")
sep = name[-1]
name = name[:-1]
if name: params += [name]
if sep == "\n": break
self.read_while(" ")
text = self.read_lines_until("#enddef")
if text is None:
raise Error(self, "#define without #enddef")
self.macros[params[0]] = self.Macro(
params[0], params[1:], text, self.textdomain)
if self.verbose:
sys.stderr.write("New macro: %s.\n" % params[0])
elif self.check_for("undef "):
self.read_until(" ")
name = self.read_until("\n")
name = name.rstrip()
if " " in name:
if self.verbose: sys.stderr.write("Stray symbols in #undef %s\n" % name)
name = name.split(" ")[0]
if name in self.macros: del self.macros[name]
elif self.verbose: sys.stderr.write("undef'd macro '%s' did not exist\n" % name)
elif self.check_for("ifdef ") or self.check_for("ifndef"):
what = "#" + self.read_until(" ").rstrip()
name = self.read_until(" \n")
if name[-1] == " ": self.read_while(" \n")
name = name[:-1]
condition_failed = False
if what == "#ifdef":
if name in self.macros:
pass
else:
condition_failed = True
else: # what == "#ifndef"
if not name in self.macros:
pass
else:
condition_failed = True
self.preprocessor_nesting.append((what, condition_failed))
# If the condition is true, we simply continue parsing. At
# some point we will either hit an #else or #endif, and
# things continue there. If the condition failed, we skip
# over everything until we find the matching #else or
# endif.
if condition_failed:
# Stay in this text while skipping so the readers do not pop back to
# the including text in the middle of the conditional block.
self.stay_in_file = True
balance = 1
while balance > 0 and not self.at_end():
line = self.read_until("\n")
line = line.lstrip()
if line.startswith("#ifdef"): balance += 1
if line.startswith("#ifndef"): balance += 1
if line.startswith("#endif"): balance -= 1
if line.startswith("#else"):
if balance == 1:
# -1 marks "stopped at the matching #else": parsing resumes with the
# else-branch and the nesting entry stays until its #endif.
balance = -1
break
self.stay_in_file = False
if balance == 0:
self.preprocessor_nesting.pop()
if balance > 0:
raise Error(self, "Missing #endif for %s" % what)
self.check_end()
elif self.check_for("else"):
if not self.preprocessor_nesting:
raise Error(self, "#else without #ifdef")
self.read_until("\n")
# We seen an #else - that means we are at the end of a
# conditional preprocessor block which has executed. So
# we should now ignore everything up to the #endif.
balance = 1
self.stay_in_file = True
while balance > 0 and not self.at_end():
line = self.read_until("\n")
line = line.lstrip()
if line.startswith("#ifdef"): balance += 1
if line.startswith("#ifndef"): balance += 1
if line.startswith("#endif"): balance -= 1
self.stay_in_file = False
if balance != 0:
raise Error(self, "Missing #endif for #else")
self.check_end()
elif self.check_for("endif"):
if not self.preprocessor_nesting:
raise Error(self, "#endif without #ifdef")
self.preprocessor_nesting.pop()
self.read_until("\n")
elif self.check_for("textdomain"):
self.read_until(" ")
self.textdomain = self.read_until("\n").strip()
else: # comment
line = self.read_until("\n")
#comment = c + line
if self.verbose:
#msg = "Comment removed: %s" % comment
##sys.stderr.write(msg.encode("utf8"))
pass
elif c == '[':
name = self.read_until("]")[:-1]
if name[0] == '/':
# Closing tag: it has to match the tag this call was opened for.
if state == name[1:]:
return
raise Error(self, "Mismatched closing tag [%s], expected [/%s]" % (name, state))
elif name[0] == '+':
# [+tag] continues the last existing [tag]; if there is none, a new
# one is created instead.
name = name[1:]
try:
subdata = data.dict[name][-1]
self.parse_top(subdata, name)
except KeyError:
subdata = wmldata.DataSub(name)
subdata.set_meta(self.filename, self.line)
self.parse_top(subdata, name)
data.insert(subdata)
elif name[0] == " ":
# We single this case out explicitly as the wesnoth parser
# ignores it due to implementation reasons and this makes
# more sense of the error message
raise Error(self, "Invalid tag [%s] contains whitespace" % name)
else:
subdata = wmldata.DataSub(name)
subdata.set_meta(self.filename, self.line)
self.parse_top(subdata, name)
data.insert(subdata)
elif c == '{':
keep_macro = self.parse_macro()
if isinstance(keep_macro, wmldata.Data):
data.insert(keep_macro)
else:
# Anything else starts a key=value statement.
for subdata in self.parse_inside(data, c):
data.insert(subdata)
import json
def jsonify(tree, verbose=False, depth=0):
"""
Convert a DataSub into JSON
If verbose, insert a linebreak after every brace and comma (put every item on its own line), otherwise, condense everything into a single line.
"""
print "{",
first = True
sdepth1 = "\n" + " " * depth
sdepth2 = sdepth1 + " "
for child in tree.children():
if first:
first = False
else:
sys.stdout.write(",")
if verbose:
sys.stdout.write(sdepth2)
print'"%s":' % child.name,
if child.get_type() == "DataSub":
jsonify(child, verbose, depth + 1)
else:
print json.dumps(child.get_value()),
if verbose:
sys.stdout.write(sdepth1)
sys.stdout.write("}")
from xml.sax.saxutils import escape
def xmlify(tree, verbose=False, depth=0):
sdepth = ""
if verbose:
sdepth = " " * depth
for child in tree.children():
if child.get_type() == "DataSub":
print '%s<%s>' % (sdepth, child.name)
xmlify(child, verbose, depth + 1)
print '%s</%s>' % (sdepth, child.name)
else:
if "\n" in child.get_value() or "\r" in child.get_value():
print sdepth + '<' + child.name + '>' + \
'<![CDATA[' + child.get_value() + ']]>' + '</' + child.name + '>'
else:
print sdepth + '<' + child.name + '>' + \
escape(child.get_value()) + '</' + child.name + '>'
if __name__ == "__main__":
import argparse, subprocess
try: import psyco
except ImportError: pass
else: psyco.full()
# Hack to make us not crash when we encounter characters that aren't ASCII
sys.stdout = __import__("codecs").getwriter('utf-8')(sys.stdout)
argumentparser = argparse.ArgumentParser("usage: %(prog)s [options]")
argumentparser.add_argument("-p", "--path", help = "specify wesnoth data path")
argumentparser.add_argument("-C", "--color", action = "store_true",
help = "use colored output")
argumentparser.add_argument("-u", "--userpath", help = "specify userdata path")
argumentparser.add_argument("-e", "--execute", help = "execute given WML")
argumentparser.add_argument("-v", "--verbose", action = "store_true",
help = "make the parser very verbose")
argumentparser.add_argument("-n", "--no-macros", action = "store_true",
help = "do not expand any macros")
argumentparser.add_argument("-c", "--contents", action = "store_true",
help = "display contents of every tag")
argumentparser.add_argument("-j", "--to-json", action = "store_true",
help = "output JSON version of tree")
argumentparser.add_argument("-x", "--to-xml", action = "store_true",
help = "output XML version of tree")
argumentparser.add_argument("filename", nargs = "?",
help = "file to parse")
args = argumentparser.parse_args()
if args.path:
path = args.path
else:
try:
p = subprocess.Popen(["wesnoth", "--path"], stdout = subprocess.PIPE)
path = p.stdout.read().strip()
path = os.path.join(path, "data")
except OSError:
sys.stderr.write("Could not determine Wesnoth path.\n")
path = None
wmlparser = Parser(path, args.userpath)
if args.no_macros:
wmlparser.no_macros = True
if args.verbose:
wmlparser.verbose = True
def gt(domain, x):
print "gettext: '%s' '%s'" % (domain, x)
return x
wmlparser.gettext = gt
wmlparser.do_preprocessor_logic = True
if args.execute:
wmlparser.parse_text(args.execute)
elif args.filename:
wmlparser.parse_file(args.filename)
else:
wmlparser.parse_stream(sys.stdin)
data = wmldata.DataSub("WML")
wmlparser.parse_top(data)
if args.to_json:
jsonify(data, True) # For more readable results
elif args.to_xml:
xmlify(data, True)
else:
data.debug(show_contents = args.contents, use_color = args.color)