# -*- Mode: Python -*-

"""Examples.tag -- Tag metalanguage examples

Original examples from Marc-Andre Lemburg, translated into the metalanguage
by Tony J Ibbs, at various dates up to and including 1999-10-12

Note that these are *examples*, and not all can actually be parsed by the
current translator...

My comments start "#TJI"

All original examples are:

(c) Copyright Marc-Andre Lemburg; All Rights Reserved.
See the documentation for further information on copyrights,
or contact the author (mal@lemburg.com).
"""

# NOTE(review): the tagging commands and helpers used below (Is, IsIn, AllIn,
# AllInSet, AllNotIn, IsInSet, Table, ThisTable, Skip, Jump, To, MatchOk, EOF,
# Here, Word, WordStart, NoWord, Loop, LoopControl, Break, set, number, alpha,
# a2z, white, newline, german_alpha, any, word_in_list) are not defined or
# imported anywhere in this file -- presumably they come from mxTextTools
# (e.g. "from TextTools import *"); confirm before trying to import this file.

""" RTF - tag a RTF string (Version 0.2) [alternative version]

    This version does recursion using the ThisTable special
    argument to the Table cmd.

"""

numeral = (
    # sign ?
    (None,Is,'-',+1),
    (None,AllIn,number),
)

# XXX: doesn't know how to handle \bin et al. with embedded {}

ctrlword = (
    ('name',AllIn,a2z),                         # name
    (None,Is,' ',+1,MatchOk),                   # delimiter
    (None,IsIn,number+'-',MatchOk),
    (None,Skip,-1),                             # unread the previous character
    ('param',Table,numeral,0,MatchOk),
    (None,Is,' ',+1,MatchOk),
    (None,Skip,-1),                             # unread the previous character
)

hex = set(number+'abcdefABCDEF')
notalpha = set(alpha,0)

#TJI ------------------------------------------------------------
#TJI The original text for ctrlsymbol is:
#TJI
#TJI    ctrlsymbol = (
#TJI        (None,Is,"'",+3),                       # hexquote
#TJI        (None,IsInSet,hex),
#TJI        (None,IsInSet,hex),
#TJI        (None,IsInSet,notalpha,+1,MatchOk)      # other
#TJI    )
#TJI
#TJI A first transformation of the original text gives us:
#TJI
#TJI    ctrlsymbol = Table is:
#TJI        Is "'" F:<4>                            # hexquote
#TJI        IsInSet hex
#TJI        IsInSet hex
#TJI        <4>
#TJI        IsInSet notalpha F:next T:MatchOk       # other
#TJI
#TJI which quickly further transforms to become:

ctrlsymbol = (
    (None,Is,"'",+3,+1),                        # hexquote
    (None,IsInSet,hex),
    (None,IsInSet,hex),
    (None,IsInSet,notalpha,+1,MatchOk),         # other
)

#TJI which I think is a lot clearer, except that the last line could
#TJI presumably be better written as:
#TJI        IsInSet notalpha F:MatchOk T:MatchOk    # other
#TJI ------------------------------------------------------------

rtf = (
    # <top>     # this is a label at the top of the definition
    (None,Is,'\\',+3,+1),                       # control ?
    ('word',Table,ctrlword,+1,-1),              # word
    ('symbol',Table,ctrlsymbol,+1,-2),          # symbol
    (None,Is,'}',+2,+1),                        # closing group
    (None,Skip,-1,0,MatchOk),
    # nested group
    (None,Is,'{',+4,+1),
    # recurse
    ('group',Table,ThisTable),                  # using ourselves
    (None,Is,'}'),
    (None,Jump,To,-8),
    ('text',AllNotIn,'\\{}',+1,-9),             # document text
    ('eof',EOF,Here),                           # EOF or fail
)

""" HTML - tag a HTML string (Version 0.6)"""

# ErrorTag
error = '***syntax error'                       # error tag obj

tagname_set = set(alpha+'-'+number)
tagattrname_set = set(alpha+'-'+number)
tagvalue_set = set('"\'> ',0)
white_set = set(' \r\n\t')

tagattr = (
    ('name',AllInSet,tagattrname_set),          # name
    (None,AllInSet,white_set,+1),               # skip junk
    (None,Is,'=',MatchOk),                      # with value ?
    (None,AllInSet,white_set,+1),               # skip junk
    ('value',AllInSet,tagvalue_set,+1,MatchOk), # unquoted value
    (None,Is,'"',+4,+1),                        # double quoted value
    ('value',AllNotIn,'"'),
    (None,Is,'"'),
    (None,Jump,To,MatchOk),
    (None,Is,"'",+3,+1),                        # single quoted value
    ('value',AllNotIn,"'"),
    (None,Is,"'"),
)

valuetable = (
    # ignore whitespace + '='
    #[ignored]#AllInSet set(' \r\n\t=') F:next
    # unquoted value
    ('value',AllInSet,tagvalue_set,+1,MatchOk),
    # double quoted value
    (None,Is,'"',+4,+1),
    ('value',AllNotIn,'"'),
    (None,Is,'"'),
    (None,Jump,To,MatchOk),
    # single quoted value
    (None,Is,"'",+3,+1),
    ('value',AllNotIn,"'"),
    (None,Is,"'"),
)

allattrs = (
    # <top>
    # look for attributes
    (None,AllInSet,white_set,+4,+1),
    (None,Is,'>',+1,MatchOk),
    ('tagattr',Table,tagattr),
    (None,Jump,To,-3),
    (None,Is,'>',+1,MatchOk),
    # handle incorrect attributes
    (error,AllNotIn,'> \r\n\t'),
    (None,Jump,To,-6),
)

htmltag = (
    (None,Is,'<',+21,+1),
    # is this a closing tag ?
    ('closetag',Is,'/',+1),
    # a comment ?
    ('comment',Is,'!',+4,+1),
    (None,Word,'--'),
    #[ignored]#'text' = sWordStart BMS('-->') F:next
    (None,Skip,3),
    (None,Jump,To,MatchOk),
    # a SGML-Tag ?
    ('other',AllNotIn,'>',+1),
    (None,Is,'>'),
    (None,Jump,To,MatchOk),
    # XMP-Tag ?
    ('tagname',Word,'XMP',+4,+1),
    (None,Is,'>'),
    ('text',WordStart,'</XMP>'),
    #[ignored]#Skip len('</XMP>')
    (None,Jump,To,MatchOk),
    # get the tag name
    ('tagname',AllInSet,tagname_set),
    # look for attributes
    # <huntAttributes>
    (None,AllInSet,white_set,+4,+1),
    (None,Is,'>',+1,MatchOk),
    ('tagattr',Table,tagattr),
    (None,Jump,To,-3),
    (None,Is,'>',+1,MatchOk),
    # handle incorrect attributes
    (error,AllNotIn,'> \n\r\t'),
    (None,Jump,To,-6),
)

htmltable = (
    # <top>
    # HTML-Tag
    ('htmltag',Table,htmltag,+1,+4),
    # not HTML, but still using this syntax: error or inside XMP-tag !
    (error,Is,'<',+3,+1),
    (error,AllNotIn,'>',+1),
    (error,Is,'>'),
    # <text>
    # normal text
    ('text',AllNotIn,'<',+1),
    # end of file
    ('eof',EOF,Here,-5),
)

""" Loop - loop examples (Version 0.1)"""

#TJI ------------------------------------------------------------
#TJI Looping must be susceptible to a better representation, but since I don't
#TJI have any documentation for it, I'll leave it alone for now...
#TJI ------------------------------------------------------------

# use Loop to match a certain number of subtags
table1 = (
    (None,Word,'loop '),
    # match <= 5 stars
    # <loop>
    ('loop',Loop,5,+4,+1),
    (None,Is,'*',+1,-1),
    (None,LoopControl,Break),
    (None,Jump,To,-3),
    # must end with a dot
    (None,Is,'.'),
)

# use Loop to tag subsections of a tagging table, i.e.
# emulate a Table-match
table2 = (
    ('presection',AllNotIn,'(',+1),
    # match a group of characters enclosed in ()
    # <loop>
    ('section',Loop,1,+4,+1),
    (None,Is,'('),
    (None,AllNotIn,')'),
    (None,Is,')',0,-3),
    # must end with a dot
    (None,Is,'.'),
)

# read in all chars and then do lots of null loops
table3 = (
    ('Loops',Loop,10000,MatchOk),
    #[ignored]#AllNotIn '' F:previous T:previous
)

#TJI which is identical in effect to:
#TJI
#TJI    table3a = Table is:
#TJI        'Loops' = Loop 10000:
#TJI            AllNotIn '' F:previous T:previous
#TJI
#TJI which is presumably the same as:
#TJI
#TJI    table3b = Table is:
#TJI        <loop>
#TJI        'Loops' = Loop 10000:
#TJI            AllNotIn '' F:<loop> T:<loop>

""" Python - tag table for Python (Version 0.6)

    * 0.5->0.6: changed the names of the tags !
                fixed bug in match_str()

    XXX can't handle (lambda ...) and misses not in 'if x is not'
"""

comment = (
    ('comment',Table,(
        (None,Is,'#'),
        (None,AllNotIn,'\n\r',MatchOk),
    )),
)

whitespace = (None,AllIn,' \t')

opt_whitespace = whitespace + (MatchOk,)

identifier = (
    ('identifier',Table,(
        (None,IsIn,alpha+'_'),
        (None,AllIn,alpha+'_'+number,MatchOk),
    )),
)

#TJI Note that CallArg,(<args>) gets translated to CallArg(<args>) in our
#TJI translated form (also note that this is not actually supported yet)

string = (
    ('str',Table,(
        # hints
        #[ignored]#IsIn '\"\''
        (None,Skip,-1),
        # now let's see what we have...
        (None,Word,'"""',+4,+1),
        ('string',NoWord,'"""',+1),
        (None,Word,'"""'),
        (None,Jump,To,MatchOk),
        (None,Word,"'''",+4,+1),
        ('string',NoWord,"'''",+1),
        (None,Word,"'''"),
        (None,Jump,To,MatchOk),
        (None,Is,'"',+3,+1),
        #[ignored]#'string' = CallArg(match_str,'"') F:next
        (None,Word,'"'),
        (None,Jump,To,MatchOk),
        (None,Is,"'"),
        #[ignored]#'string' = CallArg(match_str,"'") F:next
        (None,Word,"'"),
        (None,Jump,To,MatchOk),
    )),
)

skw = ["del", "from", "lambda", "return",
       "and", "elif", "global", "not", "try",
       "break", "else", "if", "or", "while",
       "except", "import", "pass",
       "continue", "finally", "in", "print",
       "for", "is", "raise"]

keywords = word_in_list(skw)

# note: '=lambda x:...' and '(lambda x:...' are not recognized,
#       yet '= lambda x:...' and '( lambda x:...' are (just like in
#       emacs python-mode) !

keyword = (
    ('kw',Table,(
        (None,AllIn,' \t\n\r'),
        # hints
        (None,IsIn,alpha),
        (None,Skip,-1),
        # one in the list keywords
        ('keyword',Table,keywords,+3,+1),
        (None,IsIn,': \t\n\r'),
        (None,Jump,To,MatchOk),
        # a function declaration
        ('keyword',Word,'def',+12,+1),
        whitespace,
        identifier,
        (None,Is,'('),
        # scan parameters
        # <startTuple>
        ('parameter',AllNotIn,'(),',+2,+1),
        # are there more ?
        (None,Is,',',+1,-1),
        # tuple in param-list ?
        (None,Is,'(',+1,-2),
        # maybe we're done
        (None,Is,')'),
        # to make sure...
        (None,Is,',',+1,-4),
        (None,Is,')',+1),
        # test for correct syntax
        (None,IsIn,': \t\n\r'),
        (None,Jump,To,MatchOk),
        # a class declaration:
        ('keyword',Word,'class',+8,+1),
        whitespace,
        identifier,
        (None,Is,'(',MatchOk),
        # scan base-classes
        ('baseclass',AllNotIn,'),',+2),
        # are there more ?
        (None,Is,',',+1,-1),
        # we're done
        # <done>
        (None,Is,')'),
        (None,IsIn,': \t\n\r'),
    )),
)

# NOTE(review): comment, string and keyword are each defined above as a
# 1-tuple wrapping a single entry, so e.g. "comment + (+1,0)" evaluates to
# (entry, 1, 0) rather than to the entry with jump offsets appended --
# confirm this is the form the translator is meant to produce.
python_script = (
    # <top>
    comment + (+1,0),
    string + (+1,-1),
    keyword + (+1,-2),
    # end-of-file ?
    (None,EOF,Here,+1,MatchOk),
    # skip uninteresting chars and restart
    (None,IsIn,any),
    #[ignored]#AllNotIn '#\'\"_ \n\r\t' F:<top> T:<top>
)

""" RTF - tag a RTF string (Version 0.2)

    This version does recursion using the TableInList cmd.

"""

# list of tables (hack to be able to do recursion)
tables = []

# indices
rtf_index = 0

numeral = (
    # sign ?
    (None,Is,'-',+1),
    (None,AllIn,number),
)

# XXX: doesn't know how to handle \bin et al. with embedded {}

ctrlword = (
    # name
    ('name',AllIn,a2z),
    # delimiter
    (None,Is,' ',+1,MatchOk),
    (None,IsIn,number+'-',+5,+1),
    (None,Skip,-1),
    ('param',Table,numeral,0,MatchOk),
    (None,Is,' ',+1,MatchOk),
    (None,Skip,-1),
)

hex = set(number+'abcdefABCDEF')
notalpha = set(alpha,0)

ctrlsymbol = (
    # hexquote
    (None,Is,"'",+3,+1),
    (None,IsInSet,hex),
    (None,IsInSet,hex),
    # other
    (None,IsInSet,notalpha,+1,MatchOk),
)

rtf = (
    # <top>
    # control ?
    (None,Is,'\\',+3,+1),
    # word
    ('word',Table,ctrlword,+1,-1),
    # symbol
    ('symbol',Table,ctrlsymbol,+1,-2),
    # closing group
    (None,Is,'}',+2,+1),
    (None,Skip,-1,0,MatchOk),
    # nested group
    (None,Is,'{',+3,+1),
    # recurse
    #[ignored]#'group' = TableInList(tables,rtf_index) #TJI Not yet supported
    (None,Is,'}'),
    (None,Jump,To,-7),
    # document text
    ('text',AllNotIn,'\\{}',+1,-8),
    # EOF
    ('eof',EOF,Here),
)

# add tables to list
tables.append(rtf)

""" Example for dynamic programming with Tag Tables...

    originated from a posting to comp.lang.python by Tim Peters:

    [Tim]
    > [Marc-Andre]
    > I can stick in any matching function I want, so I might even
    > let re.match() do some of the work. That should get me pretty close
    > to their semantics -- ok, I can't do it all the way:

    Sure you can: just let re.match() do *all* the work! Presto, tables are
    as powerful as re.

    > e.g. I currently don't have registers so back-references to already
    > matched groups will probably not work without reanalysing them.

    So you have trouble recognizing e.g. the language of the form

        <tag> ... </tag>

    where "tag" can be any (say) arbitrary alphanumeric string?

        <S> <Like> this clause is in that language </Like>, <but> this clause
        isn't <but/>, while the whole sentence is -- if you ignore the
        trailing period </S>.

    It's even better if you can do computation on backreferences and use the
    results to guide further parsing. E.g., recognizing Fortran Hollerith
    strings requires this (a string of digits, followed by "H" or "h",
    followed by any string of characters whose length is equal to the decimal
    value of the string of digits; and that's too hard for regexps too).

    teasingly y'rs - tim
"""

#TJI The original code is:

#TJI TIM = (
#TJI     # Check starting tag
#TJI     (opening_tag,Table+CallTag,
#TJI      ((None,Is,'<'),
#TJI       (None,AllInSet,alphanumeric_set),
#TJI       (None,Is,'>'),
#TJI       )),
#TJI     # Find closing tag
#TJI     ('text',TableInList,(tables,0)),
#TJI     # For completeness mark the closing tag too
#TJI     (closing_tag,Table+CallTag,
#TJI      ((None,Word,'</'),
#TJI       (None,AllInSet,alphanumeric_set),
#TJI       (None,Is,'>'),
#TJI       )),
#TJI     )

#TJI This translates as the following, which can't yet be translated...

#TJI TIM = Table is:
#TJI     # Check starting tag
#TJI     opening_tag = Table+CallTag is:     #TJI This is not supported yet
#TJI         Is '<'
#TJI         AllInSet alphanumeric_set
#TJI         Is '>'
#TJI     # Find closing tag
#TJI     'text' = TableInList(tables,0)      #TJI This is not supported yet
#TJI     # For completeness mark the closing tag too
#TJI     closing_tag = Table+CallTag is:     #TJI This is not supported yet
#TJI         Word '</'
#TJI         AllInSet alphanumeric_set
#TJI         Is '>'

""" Words - tag words in a string (Version 0.2) """

lcwords = []
cwords = []

#TJI The commented out portions cannot yet be translated...

#TJI lower_case_word = Table is:
#TJI     lcwords = AppendToTag+Table is:     #TJI This is not supported yet
#TJI         # first char in word
#TJI         IsIn a2z+umlaute
#TJI         # all other chars (if there are any)
#TJI         AllIn german_alpha F:MatchOk

#TJI capital_word = Table is:
#TJI     cwords = AppendToTag+Table is:      #TJI This is not supported yet
#TJI         # first char in word
#TJI         IsIn A2Z+Umlaute
#TJI         # all other chars (if there are any)
#TJI         AllIn german_alpha F:MatchOk

# NOTE(review): lower_case_word and capital_word are only defined inside the
# #TJI comments above, so unless they are supplied from elsewhere the table
# below raises NameError when this file is executed.
tag_words = (
    # <top>
    lower_case_word + (+1,+2),
    capital_word + (+1,),
    # <after>
    (None,AllIn,white+newline,+1),
    (None,AllNotIn,german_alpha+white+newline,+1),  # uninteresting
    (None,EOF,Here,-4),                             # EOF
)