World Book - Encyclopedia of Science

home *** CD-ROM | disk | FTP | other *** search

/ World Book - Encyclopedia of Science / WBScience.iso / DATA / QryConfig / Verity / sample.lng < prev next >

Wrap

Text File | 1998-04-26 | 17KB | 553 lines

# # Sample locale file for VDK 2.0. # $control: 1 locale: { # # Description: Inherit attributes from a different locale. All keywords # in this lng file will overwrite any attributes from the parent # locale. This provides a type of "subclass" mechanism for locales. For # example, the French Canadian locale might inherit from the French # locale, but over-ride the upper and lower case tables. # # Default: The locale "english" which implements basic English # functionality in charset 850 # # This keyword is optional inherit: "locale_name" # # Description: This gives the major and minor version number of the # current locale. This information is stamped into collections # when they are created so that you can tell which version of a # locale the collection is created with. # # Default: The default version is 1.0 # # These keywords are optional Major-Version: number Minor-Version: number # # Description: For numeric formatting, this tells the search engine to # use either a comma or a period as the decimal point. # # Default: period # # This keyword is optional decimal: "dot_or_comma" # # Description: Load in a library of routines to implement functional # callbacks for this locale. Fill in a normal DDA spec for a library. # The optional name specifies a name to use in the tokenizers section # below. (q.v.) # # Example: # driver: "DLL:french:InitFrenchLocale" # # This keyword is optional driver: "dda_spec" [ "name" ] ... # # Description: Flags that give an indication to the engine what settings # to use as the default for the current locale. # # Default: "no" for each of the settings # locale-flags: { # # query parser flags -- these flags affect the way the default query # parser works # # Automatically turn on case-sensitive search when a capital letter # is found in the query string? NoAutoCase: yes/no # Use the locale's tokenizer (either the DLL function or the lex rules) # when tokenizing a blob of text in the query? 
# No will cause the engine to use its built-in tokenizer. QueryTok: yes/no # Automatically generate an expanded search when a punctuation character # is found in a search term? For example, "AT&T" is expanded to search # for a number of possible variations (tokenizations): # # <Any>(<Many><Stem>`AT&T`,<Many><Phrase>(<Many><Stem>`AT`,<Many><Stem>`T`), # <Many><Phrase>(<Many><Stem>`AT`,<Many><Stem>`&`,<Many><Stem>`T`)) NoAutoPhrase: yes/no # # tokenization flags -- these flags affect the way text is tokenized # in this locale # # Use the engine's built-in 8bit lexer to tokenize any unprocessed # text chunks returned by this locale's functional tokenizer. If this # is on, the tokenize driver may defer tokenization of ASCII # passages to the engine by returning these passages as VdkBuffer # tokens. For example, a multibyte tokenizer can return sections of # embedded English text as buffer tokens, letting the engine tokenize # them with its built-in lexer. NeedAsciiTok: yes/no } # # Description: The following strings are used to identify a locale. # locale-def: { # # Description: Name of this locale IF and only if it is different from # the name of the subdirectory under common for this locale. This # keyword was designed to be used with the "default" locale. # # Default: name of current locale dir # # This keyword is optional name: "locale_name" # # Description: Name of this language. This should be a 2 letter string # in an ISO 639 coding. # # Default: language of parent locale (or English "en" when no parent # is explicitly specified.) # # This keyword is optional langName: "language_name" # # Description: Name of the country. This should be a 2 letter string # in an ISO 3166 coding. # # Default: country of parent locale (or "US" when no parent # is explicitly specified.) # # This keyword is optional country: "country_name" # # Description: Name of the character set for this locale. 
All strings # that this locale will manipulate are written in this charset, and all # tables and data in this lng file are written in this charset. # # Default: charset of parent locale (or 850 when no parent # is explicitly specified.) # # This keyword is optional charset: "charset_name" # # Description: name of the subdialect of the given language. # # Default: dialect of parent locale # # This keyword is optional dialect: "dialect_name" # # Description: name of the supplier of this locale. This string # can be any arbitrary string, including your company name. # # Default: supplier of parent locale # # This keyword is optional supplier: "supplier_name" } # # Description: Specify the tokenizers for various purposes. # # Equivalent: style.lex rules # # Default: the hard-wired 8bit lexer for English. If you do not specify # a tokenizer for any particular purpose, then it will use the tokenizer # you specify for the "default" purpose. If you don't specify the "default" # purpose, then it will use the built-in hard-wired 8bit lexer for English. # # WARNING! Tokenization is a very sensitive process. If you use different # tokenizers for different purposes, your highlighting is likely to be # off. DO NOT CHANGE TOKENIZERS UNLESS YOU REALLY KNOW WHAT YOU ARE DOING! # A good solution is to set only one tokenizer for the "stream" purposes # and possibly one for the "extract" purposes, as these are pretty # independent. The best solution for most accurate highlighting is to # use one tokenizer for all purposes. Set this using the "default" # purpose. (q.v.) # # This keyword is optional. tokenizers: { # # Description: name the tokenizer to be used for indexing the # document. # # WARNING! If this tokenizer is different than the "View" tokenizer, # you will VERY likely have your highlights be off. Do not set this # differently unless you know what you are doing. 
You may want to # change the "stream" tokenizer instead, which sets the tokenizer # for all of the "index", "view", and "dynamichl" purposes. Index: args # # Description: name the tokenizer to be used for viewing the # document using VdkDocStreamRead. (This includes viewing with # highlights from the full-text index.) # # WARNING! If this tokenizer is different than the "Index" tokenizer, # you will VERY likely have your highlights be off. Do not set this # differently unless you know what you are doing. You may want to # change the "stream" tokenizer instead, which sets the tokenizer # for all of the "index", "view", and "dynamichl" purposes. View: args # # Description: name the tokenizer to be used for parsing strings # in a BooleanPlus query. # # WARNING! If this tokenizer is different than the "Index" tokenizer, # then query words will be less likely to match words in the index # and your recall rate will be less. Do not change this tokenizer # unless you know what you are doing! Query: args # # Description: name the tokenizer to be used for tokenizing a # document that is being summarized. Summarize: args # # Description: name the tokenizer to be used for tokenizing a # document that is being used as a query-by-example document. QBE: args # # Description: name the tokenizer to be used for tokenizing a # free-text query. FTQP: args # # Description: name the tokenizer to be used for tokenizing a # document that is being highlighted dynamically. 
DynamicHL: args # # Description: short form for naming the tokenizer used for all # of the Summarize, FTQP, and QBE purposes Extract: args # # Description: short form for naming the tokenizer used for all # of the Index, View, and DynamicHL purposes Stream: args # # Description: name the tokenizer to be used as a default if # any of the above purposes are not explicitly set to a particular # tokenizer Default: args # # The arguments to the above can be one of the following: # # driver "driverName" # - get the functional tokenizer from the named driver # # HWLEX # - use the internal hard-wired lexer # # LEX # - a set of style.lex type of rules is included to specify # a new lexer # This is an 8bit table and can only be used for locales # that specify a character set that is of type singlebyte. # These lex rules have the same syntax as the style.lex file rules. # Note: If you are defining a multibyte locale, you must # provide the above functions in the locale driver. # If you are in a single byte locale, you can optionally write the # above functions in the driver, but the table below is more # efficient. # # Example 1: for a knowledgeable localizer. This example might give # highlighting problems because it uses different tokenizers for # different purposes. # # $control: 1 # locale: # { # # named drivers # driver: "insowrap" "inso" # driver: "veritytok" "verity" # # tokenizers: # { # Extract: driver "inso" # # Index: driver "verity" # View: driver "verity" # # DynamicHL: LEX # { # define: NL "[ \t]*\n" # # token: WORD "[A-Za-z0-9]+" # word # token: WORD "[0-9]+\\.[0-9]+" # word # token: EOS "[.?!]" # end of sentence # token: NEWLINE "{NL}" # single end-of-line # token: EOP "{NL}({NL})+" # end of paragraph # token: TAB "\t+" # tab # token: WHITE " +" # whitespace # token: PUNCT "." # all other text # } # # Default: HWLEX # } # } # # Example 2: for basic tokenization. 
This example gives the best # overall highlighting fidelity because the same tokenizer is # used everywhere. # # $control: 1 # locale: # { # # named drivers # driver: "insowrap" "inso" # # tokenizers: # { # # use this tokenizer for everything # Default: driver "inso" # } # } } # # Description: Specify the attributes of each character. (The "ctype" # table for those that know C.) # # Each entry is a bit field combination of the following: # LOC_UP 0x01 /* upper case letter */ # LOC_LW 0x02 /* lower case letter */ # LOC_NM 0x04 /* digit[0-9] */ # LOC_SP 0x08 /* whitespace */ # LOC_PT 0x10 /* char sometimes used as punctuation */ # LOC_AL 0x20 /* alphabetic char (ie. non-punctuation) */ # (The above list was extracted from vdk_loc.h) # # Equivalent: Ctype() function in the driver # # Note: This is an 8bit table and can only be used for locales # that specify a character set that is of type singlebyte. # # Default: English ctype table in charset 850 # # This keyword is optional. table: CHARTYPE { text: 0 "\x00\x01\x02 ..." text: 16 "\x00\x01\x02 ..." } # # Description: Specify the attributes of each character. # (The "ctype" table for those that know C.) # # Each entry is a bit field combination of the following: # LOC_UP 0x01 /* upper case letter */ # LOC_LW 0x02 /* lower case letter */ # LOC_NM 0x04 /* digit[0-9] */ # LOC_SP 0x08 /* whitespace */ # LOC_PT 0x10 /* char sometimes used as punctuation */ # LOC_AL 0x20 /* alphabetic char (ie. non-punctuation) */ # (The above list was extracted from vdk_loc.h) # # Equivalent: Ctype() function in the driver # # Note: This is a multibyte mapping table, and can be used for # any locale. However, it is much more efficient to use the table above # if you are defining an 8bit locale. # # Default: English ctype table in charset 850 # # This keyword is optional. mtable: CHARTYPE { map: "char" "bit_field" ... } # # Description: Specify an upper-case mapping for each character. 
# # Equivalent: You must define all of the following in the driver: # UpperCopy() # ToUpper() # strcmp() # stricmp() # strncmp() # strnicmp() # # Note: All characters must be given an upper-cased equivalent. # This is an 8bit table and can only be used for locales # that specify a character set that is of type singlebyte. # # Default: English upper-case table in charset 850 # # This keyword is optional. table: TOUPPER { text: 0 "\x00\x01\x02 ..." text: 16 "\x00\x01\x02 ..." ... text: 240 "\x00\x01\x02 ..." } # # Description: Specify an upper-case mapping for each character. # # Equivalent: You must define all of the following in the driver: # UpperCopy() # ToUpper() # strcmp() # stricmp() # strncmp() # strnicmp() # # Note: Only characters that have an upper case version need to be given an # upper-case mapping. All other characters are assumed to stay the same. # This is a multibyte mapping table, and can be used for # any locale. However, it is much more efficient to use the table above # if you are defining an 8bit locale. # # Default: English upper-case table in charset 850 # # This keyword is optional. mtable: TOUPPER { map: "from_char" "to_char" ... } # # Description: Specify a lower-case mapping for each character. # # Equivalent: You must define all of the following in the driver: # LowerCopy() # ToLower() # strcmp() # stricmp() # strncmp() # strnicmp() # # Note: All characters must be given a lower-cased equivalent. # This is an 8bit table and can only be used for locales # that specify a character set that is of type singlebyte. # # Default: English lower-case table in charset 850 # # This keyword is optional. table: TOLOWER { text: ... } # # Description: Specify a lower-case mapping for each character. # # Equivalent: You must define all of the following in the driver: # LowerCopy() # ToLower() # strcmp() # stricmp() # strncmp() # strnicmp() # # Note: Only characters that have a lower case version need to be given a # lower-case mapping. 
All other characters are assumed to stay the same. # This is a multibyte mapping table, and can be used for # any locale. However, it is much more efficient to use the table above # if you are defining an 8bit locale. # # Default: English lower-case table in charset 850 # # This keyword is optional. mtable: TOLOWER { map: "from_char" "to_char" ... } # # Description: Specify a sort order for each character in the character set. # This is an 8bit table and can only be used for locales # that specify a character set that is of type singlebyte. # # Equivalent: SortOrderValue() and OrdinalChar() in the driver # # Note: ALL characters in the character set must be listed here. # # Default: English sort order in charset 850 # # This keyword is optional. table: SORTORDER { text: 0 "\x00\x01\x02 ..." text: 16 "\x00\x01\x02 ..." } # # Description: Specify a sort order for each character in the character set. # # Equivalent: SortOrderValue() and OrdinalChar() in the driver # # Note: ALL characters in the character set must be listed here. # This is a multibyte table, and can be used for any locale. However, # it is much more efficient to use the table above if you are # defining an 8bit locale. # # Default: English sort order in charset 850 # # This keyword is optional. sortorder: { char: "char" ... } # # Description: Specify a stem rules table. # # Equivalent: StemCopy() function in the driver. # # Note: If you are defining a multibyte locale, you must instead provide a # StemCopy() function in the locale driver. # This is an 8bit table and can only be used for locales # that specify a character set that is of type singlebyte. # # Default: English 8bit stem rules # # This keyword is optional. stemtable: /minlen = <int> { map: "orig_text" "new_text" "goto_label" [<minlen>] /position = <int> ... doubles: "goto_label" [<minlen>] /position = <int> ... label: "label" /position = <int> /minlen = <int> ... } } $$