home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Source Code 1992 March
/
Source_Code_CD-ROM_Walnut_Creek_March_1992.iso
/
usenet
/
altsrcs
/
3
/
3580
< prev
next >
Wrap
Text File
|
1991-07-02
|
20KB
|
608 lines
Newsgroups: alt.sources
From: goer@ellis.uchicago.edu (Richard L. Goerwitz)
Subject: kjv browser, part 4 of 11
Message-ID: <1991Jul3.065038.28067@midway.uchicago.edu>
Date: Wed, 3 Jul 1991 06:50:38 GMT
---- Cut Here and feed the following to sh ----
#!/bin/sh
# this is bibleref.04 (part 4 of a multipart archive)
# do not concatenate these parts, unpack them in order with /bin/sh
# file binsrch.icn continued
#
# Part 1 creates _shar_seq_.tmp; refuse to run if it is not readable.
test -r _shar_seq_.tmp || {
	echo 'Please unpack part 1 first!'
	exit 1
}
# The first line of the sequence file names the part expected next.
# This is part 4, so anything else means the parts are out of order.
(
	read Scheck
	case "$Scheck" in
	4)
		exit 0
		;;
	*)
		echo Please unpack part "$Scheck" next!
		exit 1
		;;
	esac
) < _shar_seq_.tmp || exit 1
if test ! -f _shar_wnt_.tmp; then
echo 'x - still skipping binsrch.icn'
else
echo 'x - continuing file binsrch.icn'
sed 's/^X//' << 'SHAR_EOF' >> 'binsrch.icn' &&
X#
X# This file contains a single procedure, binary_index_search(str,
X# filename), which goes through a file called filename looking for a
X# line beginning with str. Note well that binary_index_search()
X# assumes lines in filename will contain more than str. Str must
X# occupy the first part of the line, separated from the remainder by
X# a tab.
X#
X############################################################################
X#
X# Links: none
X#
X# See also: retrieve.icn, makeind.icn
X#
X############################################################################
X
X
Xprocedure binary_index_search(entry, index_filename)
X
X    # Binary-search the file index_filename for a line whose first
X    # tab-delimited field is exactly entry.  On success, returns the
X    # remainder of that line after the tab (for .IND files, the
X    # bitmap-file offset, as a string).  Fails if no such line is
X    # found.  Calls abort() with code 18 if the file cannot be opened.
X    #
X    # The index file is closed before returning or failing.
X    #
X    # NOTE(review): every probe first skips to the next newline, so
X    # the line that begins at position 1 is never compared.
X    # Presumably the first line of an index is not a searchable
X    # entry - confirm against makeind.icn.
X
X    local in_index, bottom, top, loc, incr, firstpart, offset, ch, line
X
X    in_index := open(index_filename) |
X        abort("binary_index_search","can't open "||index_filename,18)
X
X    bottom := 1
X    seek(in_index, 0)
X    top := where(in_index)
X
X    # If bottom gets bigger than top, there's no such entry.
X    until bottom > top do {
X
X        loc := (top+bottom) / 2
X        seek(in_index, loc)
X
X        # Move past next newline, counting the characters consumed.
X        # Stop at end of file too, so that a file without a final
X        # newline cannot trap us in this loop.
X        incr := 1
X        repeat {
X            ch := reads(in_index) | break
X            if ch == "\n" then break
X            incr +:= 1
X        }
X        if loc+incr = bottom then {
X            top := loc-1
X            next
X        }
X
X        # If there is no line after loc, we probed into the last line
X        # of the file; anything still unexamined lies below loc.
X        # Without this, a failed read() would leave bottom and top
X        # unchanged and the search would never terminate.
X        line := read(in_index) | {
X            top := loc-1
X            next
X        }
X
X        # Check to see if the current line starts with entry (arg 1).
X        line ? {
X
X            # .IND file line format is entry\tbitmap-file-offset
X            if entry == (firstpart := tab(find("\t"))) then {
X                # Skip the tab, grab the offset, and release the file
X                # before handing the offset back.
X                offset := (move(1), tab(0))
X                close(in_index)
X                return offset
X            }
X            # Ah, this is what all binary searches do.
X            else {
X                if entry << firstpart
X                then top := loc-1
X                else bottom := loc + incr + *&subject
X            }
X        }
X    }
X
X    # No matching line.
X    close(in_index)
X    fail
X
Xend
SHAR_EOF
echo 'File binsrch.icn is complete' &&
true || echo 'restore of binsrch.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= bmp2text.icn ==============
if test -f 'bmp2text.icn' -a X"$1" != X"-c"; then
echo 'x - skipping bmp2text.icn (File already exists)'
rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting bmp2text.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'bmp2text.icn' &&
X############################################################################
X#
X# Name: bmp2text.icn
X#
X# Title: convert a bitmap to a text-chunk
X#
X# Author: Richard L. Goerwitz
X#
X# Version: 1.12
X#
X############################################################################
X#
X# This file contains bitmap_2_text(bitmap, filename). Recall that
X# bitmaps are just a series of fixed-length bitfields used to mark
X# divisions within a text. The procedure retrieve() finds words in
X# an index file, and returns a list of these bitmaps, which point to
X# divisions within the original text file - divisions within which a
X# given indexed word found by retrieve() occurs. The procedure
X# bitmap_2_text() simply takes a given bitmap and finds the text
X# with which it is associated in the full text file.
X#
X# Note that bitmap_2_text() does not seek directly to the correct
X# location within "filename" (arg 2). It first breaks down the
X# bitmap into a less precise form, looks up the location of that
X# form, seeks up to its location, and then bumbles along until it
X# reaches the chunk of text corresponding to the full "bitmap" (arg
X# 1). The reason bitmap_2_text() does this is that makeind (the
X# indexing routine which creates data files for retrieve() and
X# bitmap_2_text()) does not store the offset within filename for
X# every bitmap. It just saves the locations of major blocks. This
X# is basically just a space-saving device. It would eat up too much
X# memory (both disk and core) to keep a list of every offset for
X# every chunk of text marked out by a bitmap in filename.
X#
X# Note also that, although retrieve() returns a list of bitmaps, bit-
X# map_2_text(bitmap, filename) expects a single bitmap as its first
X# argument. It is better that text be retrieved as needed, one chunk
X# at a time, and not stuffed en masse into core memory as soon as it
X# is retrieve()'d.
X#
X############################################################################
X#
X# Links: ./indexutl.icn, ./initfile.icn
X#
X# See also: retrieve.icn, makeind.icn
X#
X############################################################################
X
X# Declared in indexutl.icn.
X# record is(FS, s_len, len, no, is_case_sensitive)
X# global IS
X
X# Declared in initfile.icn.
X# global filestats
X# record Fs(ind_filename, bmp_filename, lim_filename, IS, ofs_table)
X
Xprocedure bitmap_2_text(bitmap, filename)
X
X    # Return the chunk of text in filename that is keyed by bitmap.
X    #
X    #   bitmap   - integer-coded location (a single bitmap, not a list)
X    #   filename - name of the full text file; must be nonnull
X    #
X    # Returns the text between the matching "::" key line and the next
X    # "::" line (or EOF), with trailing newlines trimmed; a key with no
X    # text yields the empty string.  Fails if the cut-down bitmap has
X    # no entry in the offset table, if the scan passes into keys whose
X    # upper bits differ from bitmap's, or if EOF is reached first.
X    # Calls abort() if filename is null (29), cannot be opened (26),
X    # or cannot be seek()'d (27).
X    #
X    # The text file is closed on every return and failure path.
X
X    local intext, cut_down_bitmap, upto_field, offset, line, value,
X        base_value_mask, base_value, location
X    # global filestats, IS
X
X    # Check for sloppy programming.
X    /filename & abort("bitmap_2_text","you called me without a filename",29)
X
X    # If necessary, initialize stats for the current file.
X    #
X    if /filestats | /filestats[filename]
X    then initfile(filename)                  # see initfile.icn
X    # Reset IS to current file.
X    IS := filestats[filename].IS
X
X    # open full text file for reading
X    intext := open(filename) |
X        abort("bitmap_2_text", "can't open "||filename, 26)
X
X    # Determine offset to seek to by using the bitmap->offset table
X    # for the current file (arg 2).  The table is stored in
X    # filestats[filename].ofs_table and is keyed by bitmaps cut down
X    # to their first upto_field fields.
X    #
X    upto_field := 1 < (filestats[filename].IS.no * 2) / 3 | 1
X    cut_down_bitmap := ishift(bitmap, -(IS.no - upto_field) * IS.len)
X    offset := \filestats[filename].ofs_table[cut_down_bitmap] | {
X        # No such block; don't leave the text file open behind us.
X        close(intext)
X        fail
X    }
X
X    # Seek to offset, and begin looking for the string equiv. of
X    # bitmap (arg 1).
X    #
X    seek(intext, offset) |
X        abort("bitmap_2_text","can't seek to offset "||offset, 27)
X
X    #
X    # Bitmaps in "filename" (arg 2) are on their own lines, preceded
X    # by a double colon.
X    #
X    # First figure out how to tell if we've gone too far.  Basically,
X    # mask out the lower bits, and record the value of the upper bits.
X    # If the upper bits of the bitmaps being read change, then we've
X    # gone too far.
X    #
X    base_value_mask := icom(2^((IS.no - upto_field) * IS.len)- 1)
X    base_value := iand(bitmap, base_value_mask)
X
X    while line := read(intext) do {
X        line ? {
X            if ="::" then {
X                location := digits_2_bitmap(tab(0))   # in indexutl.icn
X                if bitmap = location
X                then {
X                    # Collect all text upto the next colon+colon-initial
X                    # line (::) or EOF.
X                    value := ""
X                    while line := read(intext) do {
X                        match("::",line) & break
X                        value ||:= line || "\n"
X                    }
X                    # Note that a key with an empty value returns an
X                    # empty string.
X                    close(intext)
X                    return trim(value, '\n')
X                }
X                else {
X                    if base_value ~= iand(location, base_value_mask)
X                    then {
X                        # Walked out of the block; release the file.
X                        close(intext)
X                        fail
X                    }
X                }
X            }
X        }
X    }
X
X    # we should have returned by now
X    close(intext)
X    fail
X
Xend
X
SHAR_EOF
true || echo 'restore of bmp2text.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= initfile.icn ==============
if test -f 'initfile.icn' -a X"$1" != X"-c"; then
echo 'x - skipping initfile.icn (File already exists)'
rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting initfile.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'initfile.icn' &&
X############################################################################
X#
X# Name: initfile.icn
X#
X# Title: initialize entry for file in filestats table
X#
X# Author: Richard L. Goerwitz
X#
X# Version: 1.9
X#
X############################################################################
X#
X# This file contains initfile(filename), which creates a set of stats
X# for the indexed database contained in filename. Uses several global
X# structures, primarily for speed. Beware.
X#
X############################################################################
X#
X# See also: retrieve.icn, bmp2text.icn, retrops.icn
X#
X############################################################################
X
X# Used to store stats for each filename.
Xrecord Fs(ind_filename, bmp_filename, lim_filename, IS, ofs_table)
X
X# IS is declared in indexutl.icn.
X# global IS
X
Xglobal filestats
X
Xprocedure initfile(filename)
X
X    # Create and store the per-file statistics record for filename in
X    # the global table filestats.  The record holds the names of the
X    # .IND, .BMP and .LIM files, the IS record decoded from the first
X    # line of the .IS file, and the bitmap->offset table read from the
X    # .OFS file (kept in ofs_table for later re-use).
X    #
X    # Returns the number of entries in the bitmap->offset table.
X    # Fails if filename already has an entry in filestats.  Calls
X    # abort() if the .IS file (24) or the .OFS file (23) cannot be
X    # opened.
X
X    local IS_filename, in_IS, upto_field, stored_bitmap_length,
X        ofs_filename, intext, cut_down_bitmap, block_size, offset, fs
X    # global filestats
X    initial {
X        filestats := table()
X        # OS-specific parameters are initialized here.
X        initialize_os_params()               # in indexutl.icn
X    }
X
X    # Check for sloppy programming.  Did we do this one already??
X    if not (/filestats[filename] := Fs(,,,,table())) then fail
X
X    # Records are handled by reference, so filling in fs fills in the
X    # entry that was just stored in the table.
X    fs := filestats[filename]
X
X    fs.ind_filename := dir_name(filename)||create_fname(filename, "IND")
X    fs.bmp_filename := dir_name(filename)||create_fname(filename, "BMP")
X    fs.lim_filename := dir_name(filename)||create_fname(filename, "LIM")
X
X    # Decode stored IS record for filename.  The procedure name given
X    # to abort() is this procedure's own, since initfile() is reached
X    # from more than one caller.
X    IS_filename := dir_name(filename)||create_fname(filename, "IS")
X    in_IS := open(IS_filename) | abort("initfile",
X        "Can't open "||IS_filename||".  Did you forget to index?", 24)
X    fs.IS := decode(!in_IS)
X    close(in_IS)
X
X    # Having decoded IS, we can now determine the length of the cut-
X    # down bitmaps stored in the .OFS file for filename: the smallest
X    # multiple of 8 that is >= IS.len * upto_field.
X    upto_field := 1 < (fs.IS.no * 2) / 3 | 1
X    stored_bitmap_length := ((fs.IS.len * upto_field) <= seq(0,8))
X
X    # open .OFS file
X    ofs_filename := dir_name(filename)||create_fname(filename, "OFS")
X    intext := open(ofs_filename) |
X        abort("initfile", "can't open "||ofs_filename, 23)
X
X    # Read in blocks from .OFS file, breaking them into their
X    # constituent parts: a size field, then the cut-down bitmap,
X    # then the offset, which takes up the rest of the block.
X    while block_size := read_int(intext, 8) * 8 do {
X        cut_down_bitmap := read_int(intext, stored_bitmap_length)
X        offset := read_int(intext, block_size - stored_bitmap_length)
X        insert(fs.ofs_table, cut_down_bitmap, offset)
X    }
X
X    close(intext)
X    # For lack of a better thing to return, return the size of
X    # the internal bitmap->offset table for filename.
X    return *fs.ofs_table
X
Xend
SHAR_EOF
true || echo 'restore of initfile.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= retrieve.icn ==============
if test -f 'retrieve.icn' -a X"$1" != X"-c"; then
echo 'x - skipping retrieve.icn (File already exists)'
rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting retrieve.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'retrieve.icn' &&
X############################################################################
X#
X# Name: retrieve.icn
X#
X# Title: retrieve locations of words in database file
X#
X# Author: Richard L. Goerwitz
X#
X# Version: 1.13
X#
X############################################################################
X#
X# Retrieve(pattern, filename) retrieves all locations containing
X# words matching pattern (arg1) in filename (arg2), placing them in a
X# list. "Locations" are integer-coded pointers to places in filename
X# where corresponding text is located. To actually retrieve that
X# block of text, you must call bitmap_2_text(location, filename).
X# Retrieve() only gathers up a list of locations in filename
X# containing words which match pattern.
X#
X# The reason retrieve() doesn't do the logical thing - namely, to
X# "retrieve" text itself - is that doing so might use a *lot* of
X# memory. It is far more economical to retrieve text only when a
X# given chunk is requested via bitmap_2_text().
X#
X# The format for filename must conform to a simple, but strict, set
X# of guidelines. Basically, it must interleave a series of keys
X# (so-called "bitmaps") with actual text:
X#
X# ::001:001:001
X# This is text.
X# ::001:001:002
X# This is more text.
X#
X# The lines beginning with :: (a double colon) are the keys. These
X# translate into an integer dividable internally into (in this case)
X# three bit-fields of length 10 (enough to handle 999:999:999), which
X# serve as location markers for the text that goes with them. See
X# makeind.icn for precise instructions on how to construct and index
X# files.
X#
X# Note: Patterns must match words in their entirety. For instance,
X# retrieve("dog",filename) would only retrieve exact matches for the
X# word "dog" in filename. To catch, say, "doggie" as well, it would
X# be necessary to call retrieve with a regular expression that
X# matched both dog and doggie (e.g. retrieve("dog.*",filename)).
X#
X############################################################################
X#
X# Links: codeobj.icn, ./indexutl.icn, ./binsrch.icn, ./initfile.icn
X# ./findre.icn
X#
X# See also: makeind.icn, bmp2text.icn
X#
X############################################################################
X
Xlink codeobj
X
X# The following globals contain stats for current file (here, arg2).
X# global filestats # declared in initfile.icn
X# global IS # declared in indexutl.icn
X
Xprocedure retrieve(pattern, filename, inverse)
X
X    # Collect the locations (bitmaps) in filename of words matching
X    # pattern.
X    #
X    #   pattern  - a word, or an egrep-style regular expression; it is
X    #              mapped to lowercase unless the file's IS record
X    #              marks it case sensitive
X    #   filename - name of the indexed text file; must be nonnull
X    #   inverse  - if nonnull, invert the sense of the search, i.e.
X    #              collect locations for index entries that do NOT
X    #              match pattern
X    #
X    # Returns a list of bitmaps; fails if there are no hits.  Calls
X    # abort() if filename is null (22), or if the bitmap file (29),
X    # the egrep pipe (20), or the index file (21) cannot be opened.
X
X    local bitmap_list, bmp_file, in_egrep, intext, cmd, offset, line,
X        ind_filename, firstpart
X    static is_UNIX, egrep_filename
X    initial {
X        if is_UNIX := find("UNIX",&features) then
X            # If egrep is available, use it.  It's fast.
X            egrep_filename := "egrep"
X            # egrep_filename := "/usr/local/bin/gnuegrep"
X    }
X
X    # Check for sloppy programming.
X    /filename & abort("retrieve","you called me without a filename",22)
X
X    # Initialize important variables.
X    #
X    if /filestats | /filestats[filename]
X    then initfile(filename)                  # see initfile.icn
X    bitmap_list := list()            # list will contain locations of hits
X    IS := filestats[filename].IS     # re-initialize IS for current file
X    if /IS.is_case_sensitive then
X        pattern := map(pattern)
X    ind_filename := filestats[filename].ind_filename
X
X    # Open bitmap file.
X    #
X    bmp_file := open(filestats[filename].bmp_filename) |
X        abort("retrieve","can't open "||filestats[filename].bmp_filename, 29)
X
X    # Search index.
X    #
X    if are_metas(pattern) then {
X        # NB:  are_metas() can be found in indexutl.icn
X
X        # If there are metacharacters in pattern, do a regexp pattern match.
X        # The .IND file goes:  line ::= key \t other-stuff.
X        pattern := "^(" || pattern || ")\t"
X
X        # If UNIX, then use egrep to search index.
X        #
X        if \is_UNIX then {
X
X            # Set up command line to be passed to /bin/sh.  If
X            # inverse is nonnull, invert the sense of the search
X            # (i.e. egrep -v).  The option needs a space on BOTH
X            # sides, or the command name and "-v" run together.
X            #
X            # NOTE(review): pattern is placed inside a single-quoted
X            # shell word; a pattern containing an apostrophe will
X            # break the command line.
X            cmd := egrep_filename ||
X                (if \inverse then " -v '" else " '") ||
X                pattern || "' " || ind_filename
X            # open pipe
X            in_egrep := open(cmd, "rp") |
X                abort("retrieve","can't open pipe from\n\t"||cmd, 20)
X            # grep .IND index file
X            every line := !in_egrep do {
X                line ? {
X                    # Only use an offset that was really parsed from
X                    # this line; never re-use one from an earlier line.
X                    if tab(find("\t")+1) & offset := integer(tab(0)) then
X                        bitmap_list |||:= retrieve_bitmaps(offset, bmp_file)
X                }
X            }
X            every close(bmp_file | in_egrep)
X
X        # ...otherwise (i.e. if not UNIX) use findre() instead of egrep
X        #
X        } else {
X
X            # Probably MS-DOS or something else.  SLOW, SLOW!
X            intext := open(ind_filename) |
X                abort("retrieve","can't open index file", 21)
X            # grep .IND file
X            if \inverse then {
X                # if inverse is nonnull, invert the sense of the search
X                every line := !intext do {
X                    line ? {
X                        if not findre(pattern) & tab(find("\t")+1) then {
X                            bitmap_list |||:=
X                                retrieve_bitmaps(integer(tab(0)), bmp_file)
X                        }
X                    }
X                }
X            } else {
X                # inverse is null; don't invert the sense of the search
X                every line := !intext do {
X                    line ? {
X                        if findre(pattern) & tab(find("\t")+1) then {
X                            bitmap_list |||:=
X                                retrieve_bitmaps(integer(tab(0)), bmp_file)
X                        }
X                    }
X                }
X            }
X            every close(bmp_file | intext)
X
X        }
X
X    # If *not* are_metas(pattern), then pattern is a plain word.
X    # No need to worry about is_UNIX, egrep, findre(), etc.
X    #
X    } else {
X
X        if \inverse then {
X            # Inverted search for a plain word: gather the bitmaps of
X            # every index entry whose key differs from pattern.  A
X            # binary search cannot do this (it yields an offset only
X            # for the one key that matches), so walk the index.
X            intext := open(ind_filename) |
X                abort("retrieve","can't open index file", 21)
X            every line := !intext do {
X                line ? {
X                    if firstpart := tab(find("\t")) then {
X                        if firstpart ~== pattern then {
X                            move(1)
X                            bitmap_list |||:=
X                                retrieve_bitmaps(integer(tab(0)), bmp_file)
X                        }
X                    }
X                }
X            }
X            close(intext)
X        } else {
X            # binary_index_search() may be found in binsrch.icn.
X            if offset := binary_index_search(pattern, ind_filename)
X            then bitmap_list |||:= retrieve_bitmaps(offset, bmp_file)
X        }
X        close(bmp_file)
X    }
X
X    # We're done.  See if there were any hits.
X    #
X    if *bitmap_list > 0
X    then return bitmap_list
X    else fail
X
Xend
X
X
X
Xprocedure retrieve_bitmaps(offset, f)
X
X    # Read the block of bitmaps stored at position offset in the open
X    # file f and return them as a list.  The block starts with a count
X    # (read with read_int(f, 16)), followed by that many bitmaps.
X    # Relies on the global IS describing the current file.
X
X    local hits, width
X    # global IS
X
X    # Each stored bitmap occupies the smallest multiple of 8 that is
X    # >= IS.len * IS.no.
X    width := ((IS.len * IS.no) <= seq(0,8))
X
X    hits := []
X    seek(f, offset)
X    every 1 to read_int(f, 16) do
X        put(hits, read_int(f, width))
X
X    return hits
X
Xend
SHAR_EOF
true || echo 'restore of retrieve.icn failed'
rm -f _shar_wnt_.tmp
fi
# ============= indexutl.icn ==============
if test -f 'indexutl.icn' -a X"$1" != X"-c"; then
echo 'x - skipping indexutl.icn (File already exists)'
rm -f _shar_wnt_.tmp
else
> _shar_wnt_.tmp
echo 'x - extracting indexutl.icn (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'indexutl.icn' &&
X############################################################################
X#
X# Name: indexutl.icn
X#
X# Title: indexing utilities
X#
X# Author: Richard L. Goerwitz
X#
X# Version: 1.19
X#
X############################################################################
X#
X# This file contains base_name(), dir_name(), get_index_fname(),
X# stripchars(), abort(), and gettokens().
X#
X# base_name(s), dir_name(s) - like the Unix system commands
X# create_fname(fname,ext) - get a new filename based on fname + ext
X# stripchars(s,c) - strip chars c from string s
X# abort(proc,msg,ecode) - abort procedure proc with exit code ecode
X# write_int(f, int, size) - breaks int into 8-bit chunks & writes to f
X# read_int(f, size) - like write_int, only constructs int from f
X# are_metas(pattern) - succeeds if pattern has egrep-style metas
X# digits_2_bitmap(s) - converts string 01:13:94 to an int-bitmap
X#
X############################################################################
X#
X# Links: ./findre.icn, radcon.icn, bincvt.icn
X#
X# See also: retrieve.icn, retrops.icn, bmp2text.icn, makeind.icn
SHAR_EOF
true || echo 'restore of indexutl.icn failed'
fi
# Report progress, then record in the sequence file that part 5 must be
# unpacked next.
cat << 'SHAR_MSG'
End of part 4
File indexutl.icn is continued in part 5
SHAR_MSG
echo 5 > _shar_seq_.tmp
exit 0
--
-Richard L. Goerwitz goer%sophist@uchicago.bitnet
goer@sophist.uchicago.edu rutgers!oddjob!gide!sophist!goer