# Header file for the published German lexicon.
#
# Each line of the lexicon contains the seven fields declared below,
# in the order in which they are declared here.
#
# Each declaration line in this header has two fields: the first is the
# field name, the second is a regular expression that should match every
# instance of that field in the lexicon.
# Fields are tab separated, here as well as in the lexicon.
#
# A "let $name=..." line defines a sub-pattern; later lines refer to it
# as "$name".
#
# For full documentation of the lexicon, please read the file ge_lex.doc!
#
# Usage of this file:
#   simple syntax check:
#       parse-lex -s ge_lex.hdr ge_lex.01
#   output full "parse" of fields:
#       parse-lex ge_lex.hdr ge_lex.01
#
# The "parse-lex" tool is available via FTP from ftp.ldc.upenn.edu

# Field 1: the headword itself.
HEADWORD	[-A-ZÄÖÜßa-zäöüé_]+

# Field 2: morphological and stem information.
# (Note that multiple entries in Celex create some complexity here.)
# One piece is a stem followed by one or more "+"-introduced tags.
# Pieces may be joined by "//", and such groups by "||"; see ge_lex.doc
# for what the two separators mean.
let $morphpiece=([-A-ZÄÖÜßa-zäöüé]+(\+[1-3]?[A-Z][a-zA-Z_]*)+)
MORPH	$morphpiece(//$morphpiece)*(\|\|$morphpiece(//$morphpiece)*)*

# Field 3: pronunciation; multiple pronunciations are separated by "||".
# A "~" is only accepted directly after one of: w a $ O.
let $pronpiece=([aeiouA@EIOUWwYy&pbtdkgGmnlrfvszSZjxhV\$]|([wa\$O]~))+
PRON	$pronpiece(\|\|$pronpiece)*
# Disabled alternative PRON pattern (accepts any ASCII letter plus @ & $ ~):
#PRON	[a-zA-Z@&\$~]+(\|\|[a-zA-Z@&\$~]+)*

# Field 4: stress pattern(s): 1=stressed, 0=unstressed.
# Each pattern must contain at least one 1; patterns are separated by "||".
STRESS	0*1[01]*(\|\|0*1[01]*)*

# Field 5: was this word in the CELEX lexicon?
CELEX	[01]

# Field 6: number of occurrences in training transcripts.
TRAIN	[0-9]+

# Field 7: number of occurrences in devtest transcripts.
DEV	[0-9]+