# -*- python -*-
"""Translation tags for Translate.py"""
__version__ = "0.3 (tiepin) of 1999-11-15"
__author__ = "Tibs"
# This is a back-translation of the tuples which were written by hand
# within Translate.py, *but* it is also suitable for translation (by
# said module) into Python code, and then importing into Translate.py
# for use instead of the original (hand written) tuples.
from TextTools import *
# ------------------------------------------------------------
# We are not, initially, going to try for anything very sophisticated
# - just something that will get us bootstrapped, so that I can use the
# "little language" to write more sophisticated stuff (without having
# to worry about dropped commas between tuples, and so on!)
# Whitespace is always useful
t_whitespace = (None, AllIn, ' \t')     # a run of spaces and/or tabs, untagged
# The extra +1 means "carry on with the next tuple" when no whitespace is found
t_opt_whitespace = t_whitespace + (+1,)
# Comments are fairly simple
t_comment = (
    'comment', Table, (
        (None, Is, '#'),                        # the hash that opens the comment
        (None, AllNotIn, '\n\r', MatchOk),      # then anything up to the line end
    ))
# We care about the "content" of the indentation at the start of a
# line, but note that it is optional
t_indent = ('indent', AllIn, ' \t')     # leading spaces/tabs, tagged 'indent'
# With no indentation at all, simply step on to the next tuple (nothing tagged)
t_indentation = t_indent + (+1,)
# A string is text within single or double quotes
# (of course, this is an oversimplification, because we should also
# deal with things like "This is a \"substring\"", and it would be
# nice to be able to cope with triple-quoted strings too, but it
# will do for a start)
# Major bug - doesn't recognise zero length strings...
# (since "AllNotIn" must match at least one character)
t_string = (
    'str', Table, (
        # Single-quoted form - without an opening "'" jump on to the
        # double-quoted form three entries down
        (None, Is, "'", +3, +1),
        ('text', AllNotIn, "'"),
        (None, Is, "'", MatchFail, MatchOk),
        # Double-quoted form
        (None, Is, '"'),
        ('text', AllNotIn, '"'),
        (None, Is, '"'),
    ))
# An integer is a series of digits...
t_integer = ('int', AllIn, number)      # one or more characters from `number`
t_signed_integer = (
    'signed_int', Table, (
        ('sign', Is, "+", +1, +2),      # "+" found - skip the "-" test
        ('sign', Is, "-", +1),          # no sign at all is fine as well
        # <int>
        t_integer,
    ))
# We'll only go for the simplest words
# Remember to be careful to specify the LONGEST possible match first, so that
# we try for "IsIn" before we try for "Is" (because "IsIn" would *match* "Is",
# leaving us with a spurious "In" hanging around...)
t_operation = (
    'op', Table, (
        # Each entry either matches (and we are done) or falls through to the
        # next one; names sharing a prefix are listed longest first
        ('op', Word, "AllInSet", +1, MatchOk),
        ('op', Word, "AllIn", +1, MatchOk),
        ('op', Word, "AllNotIn", +1, MatchOk),
        ('op', Word, "CallArg", +1, MatchOk),
        ('op', Word, "Call", +1, MatchOk),
        ('op', Word, "EOF", +1, MatchOk),
        ('op', Word, "Fail", +1, MatchOk),
        ('op', Word, "IsInSet", +1, MatchOk),
        ('op', Word, "IsIn", +1, MatchOk),
        ('op', Word, "IsNotIn", +1, MatchOk),
        ('op', Word, "IsNot", +1, MatchOk),
        ('op', Word, "Is", +1, MatchOk),
        ('op', Word, "Jump", +1, MatchOk),
        ('op', Word, "LoopControl", +1, MatchOk),
        ('op', Word, "Loop", +1, MatchOk),
        ('op', Word, "Move", +1, MatchOk),
        ('op', Word, "NoWord", +1, MatchOk),    # alias for WordStart
        ('op', Word, "Skip", +1, MatchOk),
        ('op', Word, "SubTableInList", +1, MatchOk),
        ('op', Word, "SubTable", +1, MatchOk),
        ('op', Word, "sFindWord", +1, MatchOk),
        ('op', Word, "sWordStart", +1, MatchOk),
        ('op', Word, "sWordEnd", +1, MatchOk),
        ('op', Word, "TableInList", +1, MatchOk),
        ('op', Word, "Table", +1, MatchOk),
        ('op', Word, "WordStart", +1, MatchOk),
        ('op', Word, "WordEnd", +1, MatchOk),
        # Last chance - if this is not matched either, the table fails
        ('op', Word, "Word", MatchFail, MatchOk),
    ))
# Python keywords
t_keyword = (
    'keyword', Table, (
        # On a match, every entry jumps forward to the <check> entry below,
        # so the offsets count down by one per line
        (None, Word, "and", +1, +28),
        (None, Word, "assert", +1, +27),
        (None, Word, "break", +1, +26),
        (None, Word, "class", +1, +25),
        (None, Word, "continue", +1, +24),
        (None, Word, "def", +1, +23),
        (None, Word, "del", +1, +22),
        (None, Word, "elif", +1, +21),
        (None, Word, "else", +1, +20),
        (None, Word, "except", +1, +19),
        (None, Word, "exec", +1, +18),
        (None, Word, "finally", +1, +17),
        (None, Word, "for", +1, +16),
        (None, Word, "from", +1, +15),
        (None, Word, "global", +1, +14),
        (None, Word, "if", +1, +13),
        (None, Word, "import", +1, +12),
        (None, Word, "in", +1, +11),
        (None, Word, "is", +1, +10),
        (None, Word, "lambda", +1, +9),
        (None, Word, "not", +1, +8),
        (None, Word, "or", +1, +7),
        (None, Word, "pass", +1, +6),
        (None, Word, "print", +1, +5),
        (None, Word, "raise", +1, +4),
        (None, Word, "return", +1, +3),
        (None, Word, "try", +1, +2),
        (None, Word, "while", MatchFail, +1),
        # <check>
        # Something like "in_THIS_CASE" must not count as a keyword, so
        # fail if the character after the word could continue an identifier
        (None, IsIn, alpha+"_"+number, +1, MatchFail),
        # Otherwise step back one character, the idea being that (for
        # instance) "else:" is seen as "else" followed by ":"
        (None, Skip, -1),
    ))
# Do the same for mxText commands
t_mxkeyword = (
    'mxKeyword', Table, (
        t_operation,
        # Same trailing check as is used for the Python keywords
        (None, IsIn, alpha+"_"+number, +1, MatchFail),
        (None, Skip, -1),
    ))
# Traditional identifiers
t_identifier = (
    'identifier', Table, (
        t_keyword + (+1, MatchFail),        # Python keywords are rejected
        t_mxkeyword + (+1, MatchFail),      # and so are mxText commands
        (None, IsIn, alpha+"_"),            # first character is never a digit
        (None, AllIn, alpha+'_'+number, MatchOk),
    ))
# The following does not yet deal with anything in parentheses,
# which means we can't handle functions or command lists, or other
# things which "look like" a tuple
t_argument = (
    'arg', Table, (
        # The special words first...
        ('arg', Word, "Here", +1, MatchOk),         # EOF Here, Fail Here
        ('arg', Word, "ToEOF", +1, MatchOk),        # Move ToEOF
        ('arg', Word, "To", +1, MatchOk),           # Jump To
        ('arg', Word, "ThisTable", +1, MatchOk),    # [Sub]Table ThisTable
        ('arg', Word, "back", +1, MatchOk),         # Skip back
        ('arg', Word, "Break", +1, MatchOk),        # LoopControl Break
        ('arg', Word, "Reset", +1, MatchOk),        # LoopControl Reset
        # ...then the more general values
        t_string + (+1, MatchOk),                   # e.g., Word "Fred"
        t_signed_integer + (+1, MatchOk),           # e.g., Skip -4, Move 3
        t_identifier,                               # e.g., Table Fred
    ))
# Recognise a plus sign bordered by optional whitespace
t_plus = (
    'plus', Table, (
        t_opt_whitespace,       # spaces before the "+" are allowed
        (None, Is, "+"),
        t_opt_whitespace,       # and so are spaces after it
    ))
# Arguments can contain "+"
t_plus_arg = (
    'plusarg', Table, (
        t_argument,             # there must be at least one argument
        # <again>
        t_plus + (MatchOk,),    # no "+" here means we have finished
        t_argument,             # a "+" must be followed by an argument
        (None, Jump, To, -2),   # go back to <again> for any further "+"
    ))
# Match, for example:
# <fred>
t_label = (
    'label', Table, (
        (None, Is, "<"),        # opening angle bracket
        t_identifier,           # the label's name
        (None, Is, ">"),        # closing angle bracket
    ))
# Targets for Jump and F:/T:
t_target = (
    'target', Table, (
        # One of the fixed target words...
        ('tgt', Word, "next", +1, MatchOk),
        ('tgt', Word, "previous", +1, MatchOk),
        ('tgt', Word, "repeat", +1, MatchOk),
        ('tgt', Word, "MatchOk", +1, MatchOk),
        ('tgt', Word, "MatchOK", +1, MatchOk),      # for kindness' sake
        ('tgt', Word, "MatchFail", +1, MatchOk),
        # ...or else it has to be a <label>
        t_label,
    ))
# A value is either an identifier, or a string, or an integer
t_value = (
    'val', Table, (
        t_identifier + (+1, MatchOk),   # try an identifier first
        t_string + (+1, MatchOk),       # then a quoted string
        t_integer,                      # and an integer as the last resort
    ))
# An assignment is (optionally) used in Tuple and Table definitions...
t_assignment = (
    'assignment', Table, (
        t_value,                # the thing being assigned to
        t_opt_whitespace,
        (None, Is, '='),        # the "=" itself
    ))
# A common error when writing tuples is to miss off the "=" sign
# - the following is used in diagnosing that (see t_bad_tuple below)
# (it's useful to have something with identical structure to the
# "real thing")
t_bad_tagobj = (
    'tagobj', Table, (
        t_string,               # just the string, with no "=" after it
    ))
t_bad_assignment = (
    'assignment', Table, (
        t_value,                # just the value, with no "=" after it
    ))
# This is the line that starts the definition of a single tuple.
# For the moment, restrict what it gets assigned to to a simple
# identifier.
# Match, for example:
# Fred is:
t_tupleblock = (
    'tupleblock', Table, (
        t_identifier,           # the name being defined
        t_whitespace,
        (None, Word, "is:"),
    ))
# This is the line that starts a new table or sub-table.
# Both "Table" and "SubTable" are recognised.
# NOTE that this is used for the "outer" declaration of a tag table,
# and also for the "inner" declaration of an inner table or sub-table.
# The discrimination between these is done after initial parsing.
# Match, for example:
# 'keyword' = Table is: (inner)
# tagtable = Table is: (outer)
t_tableblock = (
    'tableblock', Table, (
        t_assignment + (+2, +1),            # the left hand side may be absent
        t_opt_whitespace,
        ('type', Word, "Table", +1, +2),    # "Table" - go on to <ok>
        ('type', Word, "SubTable"),         # otherwise it must be "SubTable"
        # <ok>
        t_whitespace,                       # mandatory whitespace
        (None, Word, "is:"),                # mandatory "is:"
    ))
# This is the line that starts an "if" block
# Match, for example:
# Is "Fred":
# controlsymbol:
t_ifblock = (
    'ifblock', Table, (
        t_assignment + (+2, +1),            # the left hand side may be absent
        t_opt_whitespace,
        t_operation + (+4, +1),             # not an operation - go to Else
        t_whitespace,
        t_plus_arg,
        (None, Is, ":", MatchFail, MatchOk),
        # Else:
        t_identifier,                       # the "<identifier>:" form
        (None, Is, ":"),
    ))
# Note that we don't allow spaces WITHIN our false and true thingies
t_onfalse = (
    'onfalse', Table, (
        t_whitespace,
        (None, Word, "F:"),     # the target follows the colon directly
        t_target,
    ))
t_ontrue = (
    'ontrue', Table, (
        t_whitespace,
        (None, Word, "T:"),     # the target follows the colon directly
        t_target,
    ))
# Valid examples are things like:
# 'fred' = Is "xxx" F:<wow> T:MatchOk
# AllIn jim T:<foundJim>
#
# For the moment, we're not trying to recognise things in any detail
t_tuple = (
    'tuple', Table, (
        t_assignment + (+2, +1),            # the left hand side may be absent
        t_opt_whitespace,
        t_operation,                        # there must be an operation
        t_whitespace,                       # space is (for now) always needed
        t_plus_arg,                         # there must be an argument
        t_onfalse + (+1, +1),               # F:target may be absent
        t_ontrue + (MatchOk, MatchOk),      # T:target may be absent too
    ))
# If the user has defined a "partial" tuple, they might use something
# of the form:
# match_fred F:MatchFail T:MatchOk
t_tupleplus = (
    'tupleplus', Table, (
        t_identifier,                       # name of the "partial" tuple
        t_onfalse + (+1, +1),               # F:target may be absent
        t_ontrue + (MatchOk, MatchOk),      # T:target may be absent too
    ))
# Treat Jump To specially - for example:
# Jump To <top>
# so that they don't have to do the less obvious "Jump To F:<label>"
# (although that will still be recognised, of course, for people who
# are used to the tag tuple format itself)
t_jumpto = (
    'jumpto', Table, (
        (None, Word, "Jump"),
        t_whitespace,
        (None, Word, "To"),
        t_whitespace,
        t_target,               # where to jump to
    ))
# Is it worth coping with these?
t_bad_jumpto = (
    'jumpto', Table, (
        # Recognise the two common mis-spellings "JumpTo" and "Jump to".
        # "JumpTo" has to be tried FIRST: Word "Jump" also matches the start
        # of "JumpTo", so testing "Jump" first meant the "JumpTo" entry
        # could never be reached (longest possible match first!)
        (None, Word, "JumpTo", +1, +4),     # found - go on to <target>
        (None, Word, "Jump"),               # otherwise cope with "Jump to"
        t_whitespace,
        (None, Word, "to"),
        # <target>
        # Allow whitespace before the target, as t_jumpto does - without
        # this, "Jump to <label>" could never match
        t_opt_whitespace,
        t_target,
    ))
# The "content" of a line is the bit after any indentation, and before
# any comment...
# For the moment, we won't try to maintain ANY context, so it is up
# to the user of the tuples produced to see if they make sense...
t_content = (
    'content', Table, (
        # The alternatives are tried in this order, the first match wins
        t_label + (+1, MatchOk),
        t_tableblock + (+1, MatchOk),   # [<tagobj> =] [Sub]Table is:
        t_tupleblock + (+1, MatchOk),   # <identifier> is:
        t_ifblock + (+1, MatchOk),      # <cmd> <arg>: OR <identifier>:
        t_jumpto + (+1, MatchOk),       # Jump To <target>
        t_tuple + (+1, MatchOk),
        # NOTE(review): this last alternative still has a +1 "on failure"
        # jump, which leads past the final entry - confirm that is intended
        t_tupleplus + (+1, MatchOk),    # name [F:<label> [T:<label>]]
    ))
t_contentline = (
    'contentline', Table, (
        t_content,                  # the part of the line we are interested in
        t_opt_whitespace,
        t_comment + (+1, +1),       # a trailing comment may or may not follow
        (None, IsIn, newline),      # and then the line must end
    ))
# Sometimes, the user (e.g., me) writes:
# 'fred' = Table:
# instead of:
# 'fred' = Table is:
# Unfortunately, without the "is", it would get too confusing whether
# we actually wanted an if block...
t_bad_tableblock = (
    'tableblock', Table, (
        t_assignment + (+2, +1),    # the left hand side may be absent
        t_opt_whitespace,
        (None, Word, "Table"),      # there must be a "Table"
        (None, Is, ":"),            # followed at once by ":" (the "is" is missing)
    ))
# Sometimes, the user (e.g., me again) writes:
# 'fred' IsIn jim
# instead of:
# 'fred' = IsIn jim
# Whilst I'm not entirely convinced that "=" is the best character
# to use here, I think we do need something!
t_bad_tuple = (
    'tuple', Table, (
        t_bad_assignment,                   # a value with no "=" after it
        t_whitespace,                       # so here whitespace IS required
        t_operation,                        # there must be an operation
        t_whitespace,                       # space is (for now) always needed
        t_plus_arg,                         # there must be an argument
        t_onfalse + (+1, +1),               # F:target may be absent
        t_ontrue + (MatchOk, MatchOk),      # T:target may be absent too
    ))
# Make some attempt to recognise common errors...
t_badcontent = (
    'badcontent', Table, (
        t_bad_tableblock + (+1, MatchOk),   # "Table:" without the "is"
        t_bad_tuple,                        # a tuple without its "="
    ))
t_badline = (
    'badline', Table, (
        t_badcontent,               # one of the mistakes we know about
        t_opt_whitespace,
        t_comment + (+1, +1),       # a trailing comment may or may not follow
        (None, IsIn, newline),      # and then the line must end
    ))
t_emptyline = (
    'emptyline', Table, (
        t_opt_whitespace,           # blanks are still "empty"
        (None, IsIn, newline),      # the line must end here
    ))
t_commentline = (
    'commentline', Table, (
        t_comment,                  # nothing but a comment
        (None, IsIn, newline),      # and then the line must end
    ))
t_passthruline = (
    'passthruline', Table, (
        ('passthru', AllNotIn, newline, +1),    # whatever is left on the line
        (None, IsIn, newline),                  # and then the line must end
    ))
# Basically, a file is a series of lines
t_line = (
    'line', Table, (
        t_emptyline + (+1, MatchOk),        # nothing on the line at all
        t_indent + (+1, +1),                # indentation may or may not be there
        t_commentline + (+1, MatchOk),      # a line holding only a comment
        t_contentline + (+1, MatchOk),      # a line we understand
        t_badline + (+1, MatchOk),          # a line that looks like a mistake
        t_passthruline,                     # anything else is passed through
    ))
# So read lines until we find the EOF
t_file = (
    t_line,                     # read one line
    (None, EOF, Here, -1),      # not at EOF yet - go back for another line
)
# ----------------------------------------------------------------------
if __name__ == '__main__':
test_data = "#Test data\n"
def print_tuples(tuplist):
print "("
for item in tuplist:
print " ",item
print ")"
lines = string.split(test_data,"\n")
count = 0
print "Test data"
print "---------"
for line in lines:
count = count+1
print "%2d: %s"%(count,line)
print
print "Tagging text"
print "------------"
PYTAG = 0
if PYTAG:
import pytag
pytag.set_verbosity(1)
pytag.use_debugger()
result,taglist,next = pytag.pytag(test_data,t_file)
else:
timer = TextTools._timer()
timer.start()
result, taglist, next = tag(test_data,t_file)
print "Tagging took",timer.stop()[0],"seconds"
print "Result: ",result
print "Taglist:"
print_tuples(taglist)