# Header file for the published Mandarin lexicon.
#
# Every line of the lexicon carries the seven fields declared below, in the
# order in which they are declared here.  In each declaration the first
# column is the field name and the second column is a regular expression
# that should match every instance of that field.  Columns are tab
# separated, in this file as well as in the lexicon.
#
# For full documentation of the lexicon, please read the file ma_lex.doc!
#
# Usage of this file:
#   simple syntax check:
#     parse-lex -s ma_lex.hdr ma_lex.v03
#   output full "parse" of fields:
#     parse-lex ma_lex.hdr ma_lex.v03
#
# The "parse-lex" tool is available via FTP from ftp.ldc.upenn.edu
#
# $mchar: exactly two characters, each in the range ¡-þ.
# NOTE(review): presumably one two-byte Mandarin character -- confirm the
# encoding against ma_lex.doc.
let $mchar=([¡-þ]{2})
#
# headword: an optional leading "T" followed by one or more $mchar units
HEADWORD	(T)?$mchar+
#
# pinyin: space-separated syllables, each ending in a digit 1-5
PINYIN	[a-z:]+[1-5]( [a-z:]+[1-5])*
#
# tone: space-separated digits 1-5
TONE	[1-5]( [1-5])*
#
# pronunciation: space-separated symbols
PRON	[a-zA-Z@&%>]+( [a-zA-Z@&%>]+)*
#
# number of occurrences in Xinhua newswire
XINHUA	[0-9]+
#
# number of occurrences in training transcripts
TRAIN	[0-9]+
#
# part-of-speech tagging: one or more $pos tags joined by "/", with an
# optional trailing "/"
let $pos=(acronym|adj|adj_r|adv|adv_r|class|class_r|conj|for_name|interj|name|name_affix|noun|number|onom|part|part_struc|part_asp|part_final|phrase|prep|pro|surname|surname_affix|verb|verb_r|number_class|name_seg|affix)
POS	$pos(/$pos)*/?