# -*- Mode: Python -*-
"""Examples.tag -- Tag metalanguage examples

 Original examples from Marc-Andre Lemburg, translated into the metalanguage
 by Tony J Ibbs, at various dates up to and including 1999-10-12

 Note that these are *examples*, and not all can actually be parsed by the
 current translator...

 My comments start "#TJI"

 All original examples are:
    (c) Copyright Marc-Andre Lemburg; All Rights Reserved.
    See the documentation for further information on copyrights,
    or contact the author (mal@lemburg.com).
"""


""" RTF - tag a RTF string (Version 0.2) [alternative version]

    This version does recursion using the ThisTable special argument
    to the Table cmd.
"""
        
# numeral -- an optional leading '-' followed by a run of characters taken
# from `number` (presumably the digit characters; `number` is not defined
# in this file -- TODO confirm).
numeral = Table is:
    # sign ?  (F:next: carry on with the next line even if there is no '-')
    Is '-' F:next
    AllIn number
        
# XXX: doesn't know how to handle \bin et al. with embedded {}
        
# ctrlword -- the part of an RTF control word that follows the backslash:
# a 'name' made of `a2z` characters, then either a single space delimiter,
# or an optional numeric 'param' (see numeral) which may itself be followed
# by a space delimiter.
# (`a2z` and `number` are not defined in this file.)
ctrlword = Table is:
    'name' = AllIn a2z          # name
    Is ' ' F:next T:MatchOk     # delimiter
    # no digit or '-' here means there is no parameter: we are done
    IsIn number+'-' F:MatchOk
    Skip back                   # unread the previous character
    'param' = Table numeral F:repeat T:MatchOk
    Is ' ' F:next T:MatchOk
    Skip back                   # unread the previous character

# Character sets used by ctrlsymbol.
# NOTE(review): the second argument 0 to set() presumably asks for the
# complement of the given characters (the name "notalpha" suggests so) --
# confirm against the set() helper, which is not defined in this file.
hex = set(number+'abcdefABCDEF')
notalpha = set(alpha,0)
        
#TJI ------------------------------------------------------------
#TJI The original text for ctrlsymbol is:
#TJI
#TJI       ctrlsymbol = (
#TJI                     (None,Is,"'",+3),         # hexquote
#TJI                      (None,IsInSet,hex),
#TJI                      (None,IsInSet,hex),
#TJI                     (None,IsInSet,notalpha,+1,MatchOk) # other
#TJI                    )
#TJI
#TJI A first transformation of the original text gives us:
#TJI
#TJI       ctrlsymbol = Table is:
#TJI           Is "'" F:<4>                # hexquote
#TJI           IsInSet hex
#TJI           IsInSet hex
#TJI           <4>
#TJI           IsInSet notalpha F:next T:MatchOk   # other
#TJI
#TJI which quickly further transforms to become:

ctrlsymbol = Table is:
    # An RTF control symbol: a quote followed by two characters from the
    # `hex` set, and/or a single character from the `notalpha` set.
    Is "'":                             # hexquote
        IsInSet hex
        IsInSet hex
    # Both outcomes of this last line lead to success: T: says so
    # explicitly, and F:next steps off the end of the table.
    IsInSet notalpha F:next T:MatchOk   # other

#TJI which I think is a lot clearer, except that the last line could
#TJI presumably be better written as:
#TJI    IsInSet notalpha F:MatchOk T:MatchOk       # other
#TJI ------------------------------------------------------------

# rtf -- top-level table for tagging an RTF string.
#
# Starting again from <top> each time, it recognises in turn:
#   - a backslash followed by a control 'word' or a control 'symbol'
#   - a '}' closing the current group, which is "unread" again and ends
#     this (sub)table successfully
#   - a '{' opening a nested 'group', handled by recursing into this same
#     table (ThisTable) and then requiring the matching '}'
#   - a run of plain document 'text'
#   - end of input ('eof'); anything else makes the table fail
rtf = Table is:
    <top>       # this is a label at the top of the definition
    Is '\\':                                            # control ?
        'word'   = Table ctrlword   F:next T:previous   # word
        'symbol' = Table ctrlsymbol F:next T:<top>      # symbol
    Is '}':                                             # closing group
        Skip back F:repeat T:MatchOk                    # unread the '}' and stop
    Is '{':                                             # nested group
        'group' = Table ThisTable                       # recurse, using ourselves
        Is '}'
        Jump To <top>
    'text' = AllNotIn '\\{}' F:next T:<top>             # document text
    'eof'  = EOF Here                                   # EOF or fail
        

""" HTML - tag a HTML string (Version 0.6)"""
        

# ErrorTag -- the tag object used by the HTML tables below to mark text
# that could not be parsed (it is used unquoted, as `error = ...`).
error = '***syntax error'                       # error tag obj
        
# Character sets used by the HTML tables below.
# tagname_set and tagattrname_set are currently built from the same
# characters; they are kept as separate names for tag names and attribute
# names respectively.
# NOTE(review): the second argument 0 in tagvalue_set presumably requests
# the complement set, i.e. "anything but quote, '>' or space" -- confirm
# against the set() helper, which is not defined in this file.
tagname_set = set(alpha+'-'+number)
tagattrname_set = set(alpha+'-'+number)
tagvalue_set = set('"\'> ',0)
white_set = set(' \r\n\t')
        
# tagattr -- tag one attribute of an HTML tag: a 'name', optionally
# followed by '=' and a 'value'.  The value may be unquoted, double quoted
# or single quoted; for the quoted forms only the text between the quotes
# is tagged as 'value'.  White space is allowed on either side of the '='.
tagattr = Table is:
    'name' = AllInSet tagattrname_set   # name
    AllInSet white_set F:next           # skip junk
    Is '=' F:MatchOk                    # with value ?
    AllInSet white_set F:next           # skip junk
    'value' = AllInSet tagvalue_set F:next T:MatchOk    # unquoted value
    Is '"':                             # double quoted value
        'value' = AllNotIn '"'
        Is '"'
        Jump To MatchOk
    Is "'":                             # single quoted value
        'value' = AllNotIn "'"
        Is "'"
        
# valuetable -- tag just the value part of an attribute: skip any leading
# white space and '=' characters, then tag an unquoted, double quoted or
# single quoted 'value' (the value alternatives are the same as in tagattr).
# Nothing else in this file refers to this table.
valuetable = Table is:
    # ignore whitespace + '='
    AllInSet set(' \r\n\t=') F:next
    # unquoted value
    'value' = AllInSet tagvalue_set F:next T:MatchOk
    # double quoted value
    Is '"':
        'value' = AllNotIn '"'
        Is '"'
        Jump To MatchOk
    # single quoted value
    Is "'":
        'value' = AllNotIn "'"
        Is "'"

# allattrs -- tag all the attributes of an HTML tag, up to and including
# the closing '>'.  Each attribute is preceded by white space and tagged
# as 'tagattr'; anything that is neither white space nor '>' where an
# attribute was expected is tagged with the `error` tag object, after
# which scanning resumes at <top>.
# This is the same loop as the <huntAttributes> part of htmltag.
allattrs = Table is:
    <top>
    # look for attributes
    AllInSet white_set:
        Is '>' F:next T:MatchOk
        'tagattr' = Table tagattr
        Jump To <top>
    Is '>' F:next T:MatchOk
    # handle incorrect attributes
    error = AllNotIn '> \r\n\t'
    Jump To <top>
        
# htmltag -- tag a single HTML tag, starting at its '<'.
#
# After an optional '/' (tagged 'closetag') the alternatives are tried in
# this order:
#   1. '<!' constructs (tagged 'comment'): either a '<!-- ... -->' comment,
#      whose content is tagged 'text', or any other '<!...>' declaration,
#      whose content is tagged 'other'
#   2. an XMP tag, whose content up to '</XMP>' is tagged 'text' as it stands
#   3. an ordinary tag: a 'tagname' followed by zero or more 'tagattr's
#      (see tagattr) and the closing '>'; text that is not a well formed
#      attribute is tagged with the `error` tag object
htmltag = Table is:
    Is '<':
        # is this a closing tag ?
        'closetag' = Is '/' F:next
        # a comment ?
        'comment' = Is '!':
            # The comment body has a block of its own, so that a '<!'
            # which is not followed by '--' falls through to the SGML
            # alternative instead of failing the whole table.
            Word '--':
                'text' = sWordStart BMS('-->') F:next
                Skip 3
                Jump To MatchOk
            # a SGML-Tag ?
            # NOTE: these lines must stay inside the '!' block.  At the
            # outer level they would swallow every tag up to its '>' and
            # the XMP / tagname / attribute lines below could never be
            # reached.
            'other' = AllNotIn '>' F:next
            Is '>'
            Jump To MatchOk
        # XMP-Tag ?
        'tagname' = Word 'XMP':
            Is '>'
            'text' = WordStart '</XMP>'
            Skip len('</XMP>')
            Jump To MatchOk
        # get the tag name
        'tagname' = AllInSet tagname_set
        # look for attributes
        <huntAttributes>
        AllInSet white_set:
            Is '>' F:next T:MatchOk
            'tagattr' = Table tagattr
            Jump To <huntAttributes>
        Is '>' F:next T:MatchOk
        # handle incorrect attributes
        error = AllNotIn '> \n\r\t'
        Jump To <huntAttributes>

# htmltable -- top-level table for tagging an HTML string.
#
# Alternates between tags ('htmltag', see the htmltag table) and the
# 'text' between them until end of input ('eof').  A '<' that htmltag
# cannot parse is tagged, together with everything up to and including
# the next '>', using the `error` tag object.
htmltable = Table is:
    <top>
    # HTML-Tag
    'htmltag' = Table htmltag F:next T:<text>
    # not HTML, but still using this syntax: error or inside XMP-tag !
    error = Is '<':
        error = AllNotIn '>' F:next
        error = Is '>'
    <text>    # normal text
    'text' = AllNotIn '<' F:next
    # end of file
    'eof' = EOF Here F:<top>
        

""" Loop - loop examples (Version 0.1)"""

#TJI ------------------------------------------------------------
#TJI Looping must be susceptible to a better representation, but since I don't
#TJI have any documentation for it, I'll leave it alone for now...
#TJI ------------------------------------------------------------
        
# use Loop to match a certain number of subtags
# table1 -- matches the word 'loop ', then stars, then a final '.'.
# The stars are matched inside a Loop with a count of 5 and tagged 'loop'.
# NOTE(review): the exact semantics of Loop / LoopControl Break are not
# documented in this file (see the #TJI note above) -- confirm against the
# mxTextTools documentation before relying on this description.
table1 = Table is:
    Word 'loop '
    # match <= 5 stars
    <loop>
    'loop' = Loop 5:
        Is '*' F:next T:previous
        LoopControl Break
        Jump To <loop>
    # must end with a dot
    Is '.'
        
# use Loop to tag subsections of a tagging table, i.e.
# emulate a Table-match
# table2 -- tags the text before the first '(' as 'presection', then each
# parenthesised group '( ... )' as a 'section', and finally requires a '.'.
# Each 'section' is produced by a Loop with a count of 1, which stands in
# for a sub-table match (see the comment above this table).
# NOTE(review): Loop semantics are not documented in this file -- confirm
# against the mxTextTools documentation.
table2 = Table is:
    'presection' = AllNotIn '(' F:next
    # match a group of characters enclosed in ()
    <loop>
    'section' = Loop 1:
        Is '('
        AllNotIn ')'
        Is ')' F:repeat T:<loop>
    # must end with a dot
    Is '.'
        
# read in all chars and then do lots of null loops
# table3 -- a Loop with a count of 10000 (tagged 'Loops') whose body is the
# single AllNotIn line below; both outcomes of that line jump straight back
# to the Loop line.  When the Loop line itself fails the table finishes
# successfully.
# (The failure target was spelt "MatchOK"; every other table in this file
# uses "MatchOk", so the spelling is made consistent here.)
table3 = Table is:
    'Loops' = Loop 10000 F:MatchOk
    AllNotIn '' F:previous T:previous

#TJI which is identical in effect to:
#TJI
#TJI table3a = Table is:
#TJI    'Loops' = Loop 10000:
#TJI        AllNotIn '' F:previous T:previous
#TJI
#TJI which is presumably the same as:
#TJI
#TJI table3b = Table is:
#TJI     <loop>
#TJI     'Loops' = Loop 10000:
#TJI         AllNotIn '' F:<loop> T:<loop>


""" Python - tag table for Python (Version 0.6)

    * 0.5->0.6: changed the names of the tags !
                fixed bug in match_str()

    XXX can't handle (lambda ...) and misses not in 'if x is not'
"""

# comment -- tag a Python comment as 'comment': a '#' followed by
# everything up to (but not including) the end of the line.
comment = Table is:
    'comment' = Table is:
        Is '#'
        # F:MatchOk: an empty comment (a '#' directly followed by the end
        # of the line) is still a comment
        AllNotIn '\n\r' F:MatchOk
        
# whitespace     -- a run of blanks and/or tabs
# opt_whitespace -- the same, except that failing to match still counts as
#                   success (F:MatchOk)
# Unlike the other definitions here these are written "<name> is:" rather
# than "<name> = Table is:"; they are used as single lines inside other
# tables (see keyword).
whitespace is:
    AllIn ' \t'
opt_whitespace is:
    whitespace F:MatchOk
        
# identifier -- tag a Python identifier as 'identifier': one character from
# `alpha` or '_', optionally followed by more characters from `alpha`, '_'
# or `number` (`alpha` and `number` are not defined in this file).
identifier =Table is:
    'identifier' = Table is:
        IsIn alpha+'_'
        AllIn alpha+'_'+number F:MatchOk

#TJI Note that CallArg,(<args>) gets translated to CallArg(<args>) in our
#TJI translated form (also note that this is not actually supported yet)

# string -- tag a Python string literal.  The whole literal, quotes
# included, is tagged 'str'; the text between the quotes is tagged
# 'string'.  The four quoting styles are tried in this order: """, ''',
# " and '.
# The body of a single-line string is matched by calling match_str with
# the quote character as its argument (match_str is not defined in this
# file; see the #TJI note above about CallArg).
string = Table is:
    'str' = Table is:
        # hints
        # (look at one character to see whether this can be a string at
        # all, then unread it again)
        IsIn '\"\''
        Skip back
        # now let's see what we have...
        Word '"""':
            'string' = NoWord '"""' F:next
            Word '"""'
            Jump To MatchOk
        Word "'''":
            'string' = NoWord "'''" F:next
            Word "'''"
            Jump To MatchOk
        Is '"':
            'string' = CallArg(match_str,'"') F:next
            Word '"'
            Jump To MatchOk
        # last alternative: no block is needed, since failing here means
        # that the whole table fails
        Is "'"
        'string' = CallArg(match_str,"'") F:next
        Word "'"
        Jump To MatchOk
        
# The simple keywords, i.e. those tagged without looking at what follows
# them beyond a single delimiter character.  'def' and 'class' are not in
# this list because the keyword table below handles them separately.
skw = ["del", "from", "lambda", "return", "and", "elif",
       "global", "not", "try", "break", "else", "if", "or", "while",
       "except", "import", "pass", "continue", "finally", "in", "print",
       "for", "is", "raise"]
# presumably builds a table that matches any one of the words in the list
# (word_in_list is not defined in this file -- TODO confirm)
keywords = word_in_list(skw)
        
# note: '=lambda x:...' and '(lambda x:...' are not recognized,
#       yet '= lambda x:...' and '( lambda x:...' are (just like in
#       emacs python-mode) !
        
# keyword -- tag a Python keyword together with the construct it starts.
#
# The whole match is tagged 'kw'.  Within it:
#   - one of the simple keywords (see `keywords`) is tagged 'keyword' and
#     must be followed by ':' or white space
#   - 'def' is tagged 'keyword', followed by the function name (see
#     identifier) and the parameter list, each parameter being tagged
#     'parameter'
#   - 'class' is tagged 'keyword', followed by the class name and an
#     optional list of base classes, each tagged 'baseclass'
keyword = Table is:
    'kw' = Table is:
        # There is no F: clause here, so at least one white space character
        # is required before the keyword; presumably this is why '=lambda'
        # and '(lambda' are not recognised (see the note above).
        AllIn ' \t\n\r'
        # hints
        IsIn alpha
        Skip back
        # one in the list keywords
        'keyword' = Table keywords:
            IsIn ': \t\n\r'
            Jump To MatchOk
        # a function declaration
        'keyword' = Word 'def':
            whitespace
            identifier
            Is '('
            # scan parameters
            <startTuple>
            'parameter' = AllNotIn '(),':
                # are there more ?
                Is ',' F:next T:<startTuple>
            # tuple in param-list ?
            Is '(' F:next T:<startTuple>
            # maybe we're done
            Is ')'
            # to make sure...
            Is ',' F:next T:<startTuple>
            Is ')' F:next
            # test for correct syntax
            IsIn ': \t\n\r'
            Jump To MatchOk
        # a class declaration:
        'keyword' = Word 'class':
            whitespace
            identifier
            # no '(' means no base classes: we are done
            Is '(' F:MatchOk
            # scan base-classes
            'baseclass' = AllNotIn '),' F:<done>
            # are there more ?
            Is ',' F:next T:previous
            # we're done
            <done>
            Is ')'
            IsIn ': \t\n\r'

# python_script -- top-level table for tagging a Python source text.
#
# Comments, strings and keywords are tried in that order; each jump on
# success leads back to the comment line (T:repeat on the first line,
# T:previous on the second and T:<top> on the third all name it).  If none
# of them matches and the end of the input has not been reached, one
# character is skipped, followed by a run of characters that cannot start
# anything of interest, and scanning restarts from the top.
python_script = Table is:
    <top>
    comment F:next T:repeat
    string F:next T:previous
    keyword F:next T:<top>
    # end-of-file ?
    EOF Here F:next T:MatchOk
    # skip uninteresting chars and restart
    IsIn any
    AllNotIn '#\'\"_ \n\r\t' F:<top> T:<top>
        

""" RTF - tag a RTF string (Version 0.2)
    
    This version does recursion using the TableInList cmd.
"""
        
# list of tables (hack to be able to do recursion): the rtf table below
# refers to itself as TableInList(tables,rtf_index), and is appended to
# this list once it has been defined
tables = []

# indices
# (the position the rtf table will have in `tables`)
rtf_index = 0
        
# numeral -- an optional leading '-' followed by a run of characters taken
# from `number` (presumably the digit characters; `number` is not defined
# in this file -- TODO confirm).  Same definition as in the ThisTable
# version of the RTF example earlier in this file.
numeral = Table is:
    # sign ?
    Is '-' F:next
    AllIn number
        
# XXX: doesn't know how to handle \bin et al. with embedded {}
        
# ctrlword -- the part of an RTF control word that follows the backslash:
# a 'name' made of `a2z` characters, then either a single space delimiter,
# or an optional numeric 'param' (see numeral) which may itself be followed
# by a space delimiter.
# This version puts the parameter handling in a block under the IsIn line,
# where the ThisTable version earlier in this file uses F:MatchOk.
ctrlword = Table is:
    # name
    'name' = AllIn a2z
    # delimiter
    Is ' ' F:next T:MatchOk
    IsIn number+'-':
        # unread the character just tested, so that numeral sees it
        Skip back
        'param' = Table numeral F:repeat T:MatchOk
        Is ' ' F:next T:MatchOk
        Skip back
        
# Character sets used by ctrlsymbol (same definitions as in the ThisTable
# version of the RTF example earlier in this file).
# NOTE(review): the second argument 0 to set() presumably asks for the
# complement of the given characters -- confirm against the set() helper,
# which is not defined in this file.
hex = set(number+'abcdefABCDEF')
notalpha = set(alpha,0)
        
# ctrlsymbol -- an RTF control symbol: a quote followed by two characters
# from the `hex` set, and/or a single character from the `notalpha` set.
ctrlsymbol = Table is:
    # hexquote
    Is "'":
        IsInSet hex
        IsInSet hex
    # other
    # (both outcomes of this line lead to success: T: says so explicitly,
    # and F:next steps off the end of the table)
    IsInSet notalpha F:next T:MatchOk
        
# rtf -- top-level table for tagging an RTF string (TableInList version).
#
# Starting again from <top> each time, it recognises in turn: a backslash
# followed by a control 'word' or 'symbol'; a '}' closing the current
# group (which is unread again and ends this (sub)table successfully); a
# '{' opening a nested 'group'; a run of document 'text'; and the end of
# the input ('eof').
# The recursion for nested groups goes through the `tables` list: this
# table is looked up there at position rtf_index, which is why it has to
# be appended to `tables` after this definition.
rtf = Table is:
    <top>
    # control ?
    Is '\\':
        # word
        'word' = Table ctrlword F:next T:previous
        # symbol
        'symbol' = Table ctrlsymbol F:next T:<top>
    # closing group
    Is '}':
        Skip back F:repeat T:MatchOk
    # nested group
    Is '{':
        # recurse
        'group' = TableInList(tables,rtf_index) #TJI Not yet supported
        Is '}'
        Jump To <top>
    # document text
    'text' = AllNotIn '\\{}' F:next T:<top>
    # EOF
    'eof' = EOF Here
        
# add tables to list
# (rtf must end up at position rtf_index, i.e. 0, since that is where its
# own TableInList(tables,rtf_index) line looks it up)
tables.append(rtf)
        

""" Example for dynamic programming with Tag Tables... originated from
    a posting to comp.lang.python by Tim Peters:

 [Tim]
 > [Marc-Andre]
 > I can stick in any matching function I want, so I might even
 > let re.match() do some of the work. That should get me pretty close
 > to their semantics -- ok, I can't do it all the way:

 Sure you can:  just let re.match() do *all* the work!  Presto, tables are as
 powerful as re.

 > e.g. I currently don't have registers so back-references to already
 > matched groups will probably not work without reanalysing them.

 So you have trouble recognizing e.g. the language of the form

   <tag> ... </tag>

 where "tag" can be any (say) arbitrary alphanumeric string?  <S> <Like> this
 clause is in that language </Like>, <but> this clause isn't <but/>, while
 the whole sentence is -- if you ignore the trailing period </S>.  It's even
 better if you can do computation on backreferences and use the results to
 guide further parsing.  E.g., recognizing Fortran Hollerith strings requires
 this (a string of digits, followed by "H" or "h", followed by any string of
 characters whose length is equal to the decimal value of the string of
 digits; and that's too hard for regexps too).

 teasingly y'rs  - tim
 """

#TJI The original code is:
#TJI       TIM = (
#TJI           # Check starting tag
#TJI           (opening_tag,Table+CallTag,
#TJI            ((None,Is,'<'),
#TJI             (None,AllInSet,alphanumeric_set),
#TJI             (None,Is,'>'),
#TJI             )),
#TJI           # Find closing tag
#TJI           ('text',TableInList,(tables,0)),
#TJI           # For completeness mark the closing tag too
#TJI           (closing_tag,Table+CallTag,
#TJI            ((None,Word,'</'),
#TJI             (None,AllInSet,alphanumeric_set),
#TJI             (None,Is,'>'),
#TJI             )),
#TJI       )

#TJI This translates as the following, which can't yet be translated...

#TJI TIM = Table is:
#TJI     # Check starting tag
#TJI     opening_tag = Table+CallTag is: #TJI This is not supported yet
#TJI         Is '<'
#TJI         AllInSet alphanumeric_set
#TJI         Is '>'
#TJI     # Find closing tag
#TJI     'text' = TableInList(tables,0)  #TJI This is not supported yet
#TJI     # For completeness mark the closing tag too
#TJI     closing_tag = Table+CallTag is: #TJI This is not supported yet
#TJI         Word '</'
#TJI         AllInSet alphanumeric_set
#TJI         Is '>'
        

""" Words - tag words in a string (Version 0.2) """
        
# Lists used as tag objects by the (currently commented out)
# lower_case_word and capital_word tables below, via AppendToTag.
# presumably the matched lower case / capitalised words get appended to
# them -- TODO confirm against the AppendToTag documentation
lcwords = []
cwords = []

#TJI The commented out portions cannot yet be translated...

#TJI lower_case_word = Table is:
#TJI     lcwords = AppendToTag+Table is: #TJI This is not supported yet
#TJI         # first char in word
#TJI         IsIn a2z+umlaute
#TJI         # all other chars (if there are any)
#TJI         AllIn german_alpha F:MatchOk
   
#TJI capital_word = Table is:
#TJI     cwords = AppendToTag+Table is: #TJI This is not supported yet
#TJI         # first char in word
#TJI         IsIn A2Z+Umlaute
#TJI         # all other chars (if there are any)
#TJI         AllIn german_alpha F:MatchOk
        
# tag_words -- top-level table for tagging the words in a string.
#
# At each position a lower case word is tried first, then a capitalised
# word; after that any white space / newlines and any other uninteresting
# characters are skipped, and the whole thing repeats until end of input.
#
# NOTE: lower_case_word and capital_word are commented out above because
# they cannot be translated yet, so as it stands this table refers to
# names that are not defined in this file.  german_alpha, white and
# newline are not defined in this file either.
tag_words = Table is:
    <top>
    lower_case_word F:next T:<after>
    capital_word F:next
    <after>
    AllIn white+newline F:next
    AllNotIn german_alpha+white+newline F:next  # uninteresting
    EOF Here F:<top>                            # EOF