home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
World Book - Encyclopedia of Science
/
WBScience.iso
/
DATA
/
QryConfig
/
Verity
/
sample.lng
< prev
next >
Wrap
Text File
|
1998-04-26
|
17KB
|
553 lines
#
# Sample locale file for VDK 2.0.
#
$control: 1
locale:
{
#
# Description: Inherit attributes from a different locale. All keywords
# in this lng file will overwrite any attributes from the parent
# locale. This provides a type of "subclass" mechanism for locales. For
# example, the French Canadian locale might inherit from the French
# locale, but over-ride the upper and lower case tables.
#
# Default: The locale "english" which implements basic english
# functionality in charset 850
#
# This keyword is optional
inherit: "locale_name"
#
# Description: This gives the major and minor version number of the
# current locale. This information is stamped into collections
# when they are created so that you can tell which version of a
# locale the collection is created with.
#
# Default: The default version is 1.0
#
# These keywords are optional
Major-Version: number
Minor-Version: number
#
# Description: For numeric formatting, this tells the search engine to
# use either a comma or a period as the decimal point.
#
# Default: period
#
# This keyword is optional
decimal: "dot_or_comma"
#
# Description: Load in a library of routines to implement functional
# callbacks for this locale. Fill in a normal DDA spec for a library.
# The optional name specifies a name to use in the tokenizers section
# below. (q.v.)
#
# Example:
# driver: "DLL:french:InitFrenchLocale"
#
# This keyword is optional
driver: "dda_spec" [ "name" ]
...
#
# Description: Flags that give an indication to the engine what settings
# to use as the default for the current locale.
#
# Default: for each of the settings is "no"
#
locale-flags:
{
#
# query parser flags -- these flags affect the way the default query
# parser works
#
# Automatically turn on case-sensitive search when a capital letter
# is found in the query string?
NoAutoCase: yes/no
# Use the locale's tokenizer (either the DLL function or the lex rules)
# when tokenizing a blob of text in the query?
# No will cause the engine to use its built-in tokenizer.
QueryTok: yes/no
# Automatically generate an expanded search when a punctuation character
# is found in a search term? For example, "AT&T" is expanded to search
# for a number of possible variations (tokenizations):
#
# <Any>(<Many><Stem>`AT&T`,<Many><Phrase>(<Many><Stem>`AT`,<Many><Stem>`T`),
# <Many><Phrase>(<Many><Stem>`AT`,<Many><Stem>`&`,<Many><Stem>`T`))
NoAutoPhrase: yes/no
#
# tokenization flags -- these flags affect the way text is tokenized
# in this locale
#
# Use the engine's built-in 8bit lexer to tokenize any unprocessed
# text chunks returned by this locale's functional tokenizer. If this
# is on, the tokenize driver may defer tokenization of ASCII
# passages to the engine by returning these passages as VdkBuffer
# tokens. For example, a multibyte tokenizer can return sections of
# embedded English text as buffer tokens, letting the engine tokenize
# them with its built-in lexer.
NeedAsciiTok: yes/no
}
#
# Description: The following strings are used to identify a locale.
#
locale-def:
{
#
# Description: Name of this locale IF and only if it is different from
# the name of the subdirectory under common for this locale. This
# keyword was designed to be used with the "default" locale.
#
# Default: name of current locale dir
#
# This keyword is optional
name: "locale_name"
#
# Description: Name of this language. This should be a 2 letter string
# in an ISO 639 coding.
#
# Default: language of parent locale (or english "en" when no parent
# is explicitly specified.)
#
# This keyword is optional
langName: "language_name"
#
# Description: Name of the country. This should be a 2 letter string
# in an ISO 3166 coding.
#
# Default: country of parent locale (or "US" when no parent
# is explicitly specified.)
#
# This keyword is optional
country: "country_name"
#
# Description: Name of the character set for this locale. All strings
# that this locale will manipulate are written in this charset, and all
# tables and data in this lng file are written in this charset.
#
# Default: charset of parent locale (or 850 when no parent
# is explicitly specified.)
#
# This keyword is optional
charset: "charset_name"
#
# Description: name of the subdialect of the given language.
#
# Default: dialect of parent locale
#
# This keyword is optional
dialect: "dialect_name"
#
# Description: name of the supplier of this locale. This string
# can be any arbitrary string, including your company name.
#
# Default: supplier of parent locale
#
# This keyword is optional
supplier: "supplier_name"
}
#
# Description: Specify the tokenizers for various purposes.
#
# Equivalent: style.lex rules
#
# Default: the hard-wired 8bit lexer for English. If you do not specify
# a tokenizer for any particular purpose, then it will use the tokenizer
# you specify for the "default" purpose. If you don't specify the "default"
# purpose, then it will use the built-in hard-wired 8bit lexer for English.
#
# WARNING! Tokenization is a very sensitive process. If you use different
# tokenizers for different purposes, your highlighting is likely to be
# off. DO NOT CHANGE TOKENIZERS UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!
# A good solution is to set only one tokenizer for the "stream" purposes
# and possibly one for the "extract" purposes, as these are pretty
# independent. The best solution for most accurate highlighting is to
# use one tokenizer for all purposes. Set this using the "default"
# purpose. (q.v.)
#
# This keyword is optional.
tokenizers:
{
#
# Description: name the tokenizer to be used for indexing the
# document.
#
# WARNING! If this tokenizer is different than the "View" tokenizer,
# you will VERY likely have your highlights be off. Do not set this
# differently unless you know what you are doing. You may want to
# change the "stream" tokenizer instead, which sets the tokenizer
# for all of the "index", "view", and "dynamichl" purposes.
Index: args
#
# Description: name the tokenizer to be used for viewing the
# document using VdkDocStreamRead. (This includes viewing with
# highlights from the full-text index.)
#
# WARNING! If this tokenizer is different than the "Index" tokenizer,
# you will VERY likely have your highlights be off. Do not set this
# differently unless you know what you are doing. You may want to
# change the "stream" tokenizer instead, which sets the tokenizer
# for all of the "index", "view", and "dynamichl" purposes.
View: args
#
# Description: name the tokenizer to be used for parsing strings
# in a BooleanPlus query.
#
# WARNING! If this tokenizer is different than the "Index" tokenizer,
# then query words will be less likely to match words in the index
# and your recall rate will be less. Do not change this tokenizer
# unless you know what you are doing!
Query: args
#
# Description: name the tokenizer to be used for tokenizing a
# document that is being summarized.
Summarize: args
#
# Description: name the tokenizer to be used for tokenizing a
# document that is being used as a query-by-example document.
QBE: args
#
# Description: name the tokenizer to be used for tokenizing a
# free-text query.
FTQP: args
#
# Description: name the tokenizer to be used for tokenizing a
# document that is being highlighted dynamically.
DynamicHL: args
#
# Description: short form for naming the tokenizer used for all
# of the Summarize, FTQP, and QBE purposes
Extract: args
#
# Description: short form for naming the tokenizer used for all
# of the Index, View, and DynamicHL purposes
Stream: args
#
# Description: name the tokenizer to be used as a default if
# any of the above purposes are not explicitly set to a particular
# tokenizer
Default: args
#
# The arguments to the above can be one of the following:
#
# driver "driverName"
# - get the functional tokenizer from the named driver
#
# HWLEX
# - use the internal hard-wired lexer
#
# LEX
# - a set of style.lex type of rules is included to specify
# a new lexer
# This is an 8bit table and can only be used for locales
# that specify a character set that is of type singlebyte.
# These lex rules have the same syntax as the style.lex file rules.
# Note: If you are defining a multibyte locale, you must
# provide the above functions in the locale driver.
# If you are in a single byte locale, you can optionally write the
# above functions in the driver, but the table below is more
# efficient.
#
# Example 1: for a knowledgeable localizer. This example might give
# highlighting problems because it uses different tokenizers for
# different purposes.
#
# $control: 1
# locale:
# {
# # named drivers
# driver: "insowrap" "inso"
# driver: "veritytok" "verity"
#
# tokenizers:
# {
# Extract: driver "inso"
#
# Index: driver "verity"
# View: driver "verity"
#
# DynamicHL: LEX
# {
# define: NL "[ \t]*\n"
#
# token: WORD "[A-Za-z0-9]+" # word
# token: WORD "[0-9]+\\.[0-9]+" # word
# token: EOS "[.?!]" # end of sentence
# token: NEWLINE "{NL}" # single end-of-line
# token: EOP "{NL}({NL})+" # end of paragraph
# token: TAB "\t+" # tab
# token: WHITE " +" # whitespace
# token: PUNCT "." # all other text
# }
#
# Default: HWLEX
# }
# }
#
# Example 2: for basic tokenization. This example gives the best
# overall highlighting fidelity because the same tokenizer is
# used everywhere.
#
# $control: 1
# locale:
# {
# # named drivers
# driver: "insowrap" "inso"
#
# tokenizers:
# {
# # use this tokenizer for everything
# Default: driver "inso"
# }
# }
}
#
# Description: Specify the attributes of each character. (The "ctype"
# table for those that know C.)
#
# Each entry is a bit field combination of the following:
# LOC_UP 0x01 /* upper case letter */
# LOC_LW 0x02 /* lower case letter */
# LOC_NM 0x04 /* digit[0-9] */
# LOC_SP 0x08 /* whitespace */
# LOC_PT 0x10 /* char sometimes used as punctuation */
# LOC_AL 0x20 /* alphabetic char (ie. non-punctuation) */
# (The above list was extracted from vdk_loc.h)
#
# Equivalent: Ctype() function in the driver
#
# Note: This is an 8bit table and can only be used for locales
# that specify a character set that is of type singlebyte.
#
# Default: English ctype table in charset 850
#
# This keyword is optional.
table: CHARTYPE
{
text: 0 "\x00\x01\x02 ..."
text: 16 "\x00\x01\x02 ..."
}
#
# Description: Specify the attributes of each character.
# (The "ctype" table for those that know C.)
#
# Each entry is a bit field combination of the following:
# LOC_UP 0x01 /* upper case letter */
# LOC_LW 0x02 /* lower case letter */
# LOC_NM 0x04 /* digit[0-9] */
# LOC_SP 0x08 /* whitespace */
# LOC_PT 0x10 /* char sometimes used as punctuation */
# LOC_AL 0x20 /* alphabetic char (ie. non-punctuation) */
# (The above list was extracted from vdk_loc.h)
#
# Equivalent: Ctype() function in the driver
#
# Note: This is a multibyte mapping table, and can be used for
# any locale. However, it is much more efficient to use the table above
# if you are defining an 8bit locale.
#
# Default: English ctype table in charset 850
#
# This keyword is optional.
mtable: CHARTYPE
{
map: "char" "bit_field"
...
}
#
# Description: Specify an upper-case mapping for each character.
#
# Equivalent: You must define all of the following in the driver:
# UpperCopy()
# ToUpper()
# strcmp()
# stricmp()
# strncmp()
# strnicmp()
#
# Note: All characters must be given an upper-cased equivalent.
# This is an 8bit table and can only be used for locales
# that specify a character set that is of type singlebyte.
#
# Default: English upper-case table in charset 850
#
# This keyword is optional.
table: TOUPPER
{
text: 0 "\x00\x01\x02 ..."
text: 16 "\x00\x01\x02 ..."
...
text: 240 "\x00\x01\x02 ..."
}
#
# Description: Specify an upper-case mapping for each character.
#
# Equivalent: You must define all of the following in the driver:
# UpperCopy()
# ToUpper()
# strcmp()
# stricmp()
# strncmp()
# strnicmp()
#
# Note: Only characters that have an upper case version need to be given an
# upper-case mapping. All other characters are assumed to stay the same.
# This is a multibyte mapping table, and can be used for
# any locale. However, it is much more efficient to use the table above
# if you are defining an 8bit locale.
#
# Default: English upper-case table in charset 850
#
# This keyword is optional.
mtable: TOUPPER
{
map: "from_char" "to_char"
...
}
#
# Specify a lower-case mapping for each character.
#
# Equivalent: You must define all of the following in the driver:
# LowerCopy()
# ToLower()
# strcmp()
# stricmp()
# strncmp()
# strnicmp()
#
# Note: All characters must be given a lower-cased equivalent.
# This is an 8bit table and can only be used for locales
# that specify a character set that is of type singlebyte.
#
# Default: English lower-case table in charset 850
#
# This keyword is optional.
table: TOLOWER
{
text:
...
}
#
# Description: Specify a lower-case mapping for each character.
#
# Equivalent: You must define all of the following in the driver:
# LowerCopy()
# ToLower()
# strcmp()
# stricmp()
# strncmp()
# strnicmp()
#
# Note: Only characters that have a lower case version need to be given a
# lower-case mapping. All other characters are assumed to stay the same.
# This is a multibyte mapping table, and can be used for
# any locale. However, it is much more efficient to use the table above
# if you are defining an 8bit locale.
#
# Default: English lower-case table in charset 850
#
# This keyword is optional.
mtable: TOLOWER
{
map: "from_char" "to_char"
...
}
#
# Description: Specify a sort order for each character in the character set.
# This is an 8bit table and can only be used for locales
# that specify a character set that is of type singlebyte.
#
# Equivalent: SortOrderValue() and OrdinalChar() in the driver
#
# Note: ALL characters in the character set must be listed here.
#
# Default: English sort order in charset 850
#
# This keyword is optional.
table: SORTORDER
{
text: 0 "\x00\x01\x02 ..."
text: 16 "\x00\x01\x02 ..."
}
#
# Description: Specify a sort order for each character in the character set.
#
# Equivalent: SortOrderValue() and OrdinalChar() in the driver
#
# Note: ALL characters in the character set must be listed here.
# This is a multibyte table, and can be used for any locale. However,
# it is much more efficient to use the table above if you are
# defining an 8bit locale.
#
# Default: English sort order in charset 850
#
# This keyword is optional.
sortorder:
{
char: "char"
...
}
#
# Description: Specify a stem rules table.
#
# Equivalent: StemCopy() function in the driver.
#
# Note: If you are defining a multibyte locale, you must instead provide a
# StemCopy() function in the locale driver.
# This is an 8bit table and can only be used for locales
# that specify a character set that is of type singlebyte.
#
# Default: English 8bit stem rules
#
# This keyword is optional.
stemtable:
/minlen = <int>
{
map: "orig_text" "new_text" "goto_label" [<minlen>]
/position = <int>
...
doubles: "goto_label" [<minlen>]
/position = <int>
...
label: "label"
/position = <int>
/minlen = <int>
...
}
}
$$