home *** CD-ROM | disk | FTP | other *** search
Text File | 1991-10-19 | 32.2 KB | 1,059 lines |
- Newsgroups: comp.sources.misc
- From: goer@midway.uchicago.edu (Richard L. Goerwitz)
- Subject: v23i070: quranref - Holy Qur'an word and passage based retrievals, Part04/08
- Message-ID: <1991Oct19.022300.12923@sparky.imd.sterling.com>
- X-Md4-Signature: f2be581ed2d06ec701e8e7c057e32ebc
- Date: Sat, 19 Oct 1991 02:23:00 GMT
- Approved: kent@sparky.imd.sterling.com
-
- Submitted-by: goer@midway.uchicago.edu (Richard L. Goerwitz)
- Posting-number: Volume 23, Issue 70
- Archive-name: quranref/part04
- Environment: Icon
-
- ---- Cut Here and feed the following to sh ----
- #!/bin/sh
- # this is quranref.04 (part 4 of a multipart archive)
- # do not concatenate these parts, unpack them in order with /bin/sh
- # file huffcode.icn continued
- #
- if test ! -r _shar_seq_.tmp; then
- echo 'Please unpack part 1 first!'
- exit 1
- fi
- (read Scheck
- if test "$Scheck" != 4; then
- echo Please unpack part "$Scheck" next!
- exit 1
- else
- exit 0
- fi
- ) < _shar_seq_.tmp || exit 1
- if test ! -f _shar_wnt_.tmp; then
- echo 'x - still skipping huffcode.icn'
- else
- echo 'x - continuing file huffcode.icn'
- sed 's/^X//' << 'SHAR_EOF' >> 'huffcode.icn' &&
- X # 1. That one element remaining (h[1]) is your Huffman tree.
- X #
- X # Based loosely on Sedgewick (2nd ed., 1988), p. 328-9.
- X #
- X local frst, scnd, count
- X
- X until *h = 1 do {
- X
- X h[1] :=: h[*h] # Reverse first and last elements.
- X frst := pull(h) # Pop last elem off & save it.
- X resettle_heap(h, 1) # Resettle the heap.
- X scnd := !h # Save (but don't clobber) top element.
- X
- X count := frst.n + scnd.n
- X frst := { if *frst = 2 then frst.c else _N(frst.l, frst.r) }
- X scnd := { if *scnd = 2 then scnd.c else _N(scnd.l, scnd.r) }
- X
- X h[1] := node(frst, scnd, count) # Create new node().
- X resettle_heap(h, 1) # Resettle once again.
- X }
- X
- X # H is no longer a stack. It's single element - the root of a
- X # Huffman tree made up of node()s and leaf()s. Put the l and r
- X # fields of that element into an _N record, and return the new
- X # record.
- X return _N(h[1].l, h[1].r)
- X
- Xend
- X
- X
- Xprocedure hash_huffcodes(tr)
- X
- X #
- X # Hash Huffman codes. Tr (arg 1) is a Huffman tree created by
- X # heap_2_huffman_tree(heap). Output is a table, with the keys
- X # representing characters, and the values being records of type
- X # hcode(i,len), where i is the Huffcode (an integer) and len is
- X # the number of bits it occupies.
- X #
- X local code, huffman_table
- X
- X huffman_table := table()
- X every code := build_codes(tr) do
- X insert(huffman_table, code.c, code)
- X return huffman_table
- X
- Xend
- X
- X
- Xprocedure build_codes(tr, i, len)
- X
- X #
- X # Decompose Huffman tree tr into hcode() records which contain
- X # 3 fields: c (the character encoded), i (its integer code),
- X # and len (the number of bytes the integer code occupies). Sus-
- X # pend one such record for each character encoded in tree tr.
- X #
- X
- X if type(tr) == "string" then
- X return hcode(tr, i, len)
- X else {
- X (/len := 1) | (len +:= 1)
- X (/i := 0) | (i *:= 2)
- X suspend build_codes(tr.l, i, len)
- X i +:= 1
- X suspend build_codes(tr.r, i, len)
- X }
- X
- Xend
- X
- X
- Xprocedure block_encode(s, huffman_table)
- X
- X #
- X # Write to file f string s encoded using huffman_table (a table having
- X # chars as keys and huffman codes as values).
- X #
- X # Create huffman_table as follows (char_tbl is a table, with chars as
- X # keys and frequencies as values):
- X #
- X # heap := heap_init(char_tbl)
- X # hufftree := heap_2_huffman_tree(heap)
- X # huffman_table := hash_huffcodes(hufftree)
- X #
- X # Store the tree, hufftree. Pass the huffman table to block_encode as
- X # its second argument.
- X
- X local s2, size, hcode_4_chr, chr
- X
- X *s > 2r1111111111111111 &
- X stop("write_string: too many characters in s")
- X
- X s2 := "" # initialize size string
- X outbits() # just in case
- X every s2 ||:= outbits(*s, 16) # block size = 2 bytes
- X
- X s ? {
- X while chr := move(1) do {
- X hcode_4_chr := \huffman_table[chr] |
- X stop("write_string: unexpected char, ",image(chr))
- X every s2 ||:= outbits(hcode_4_chr.i, hcode_4_chr.len)
- X }
- X s2 ||:= outbits()
- X }
- X
- X return s2
- X
- Xend
- X
- X
- Xprocedure block_decode(f, huff_tree)
- X
- X # Undo what block_encode does.
- X
- X local how_many, s2, E, chr, bit
- X
- X s2 := ""
- X
- X # The first two bytes record how many characters the original
- X # text had in it. If the read fails, it means that the file
- X # system filled up while making the index, and the bitmaps now
- X # can't be located in f.
- X how_many := ishift(ord(reads(f)), 8) + ord(reads(f)) |
- X stop("block_decode: failure reading ",image(f))
- X # If the original text was blank (zero characters), then return
- X # an empty string.
- X if how_many = 0 then { return "" }
- X
- X reads(f, how_many) ? {
- X
- X # Otherwise, set E = to the top node of the Huffman tree, and
- X # begin decoding.
- X E := huff_tree
- X while chr := move(1) do {
- X every bit := iand(1, ishift(ord(chr), -7 to 0)) do {
- X E := { if bit = 0 then E.l else E.r }
- X if s2 ||:= string(E) then {
- X if *s2 = how_many
- X then return s2
- X else E := huff_tree
- X }
- X }
- X }
- X }
- X
- X # If we get to here, something is quite amiss!
- X stop("read_string: bad character count")
- X
- Xend
- SHAR_EOF
- echo 'File huffcode.icn is complete' &&
- true || echo 'restore of huffcode.icn failed'
- rm -f _shar_wnt_.tmp
- fi
- # ============= binsrch.icn ==============
- if test -f 'binsrch.icn' -a X"$1" != X"-c"; then
- echo 'x - skipping binsrch.icn (File already exists)'
- rm -f _shar_wnt_.tmp
- else
- > _shar_wnt_.tmp
- echo 'x - extracting binsrch.icn (Text)'
- sed 's/^X//' << 'SHAR_EOF' > 'binsrch.icn' &&
- X############################################################################
- X#
- X# Name: binsrch.icn
- X#
- X# Title: general-purpose binary index search
- X#
- X# Author: Richard L. Goerwitz
- X#
- X# Version: 1.4
- X#
- X############################################################################
- X#
- X# This file contains a single procedure, binary_index_search(str,
- X# filename), which goes through a file called filename looking for a
- X# line beginning with str. Note well that binary_index_search()
- X# assumes lines in filename will contain more than str. Str must
- X# occupy the first part of the line, separated from the remainder by
- X# a tab.
- X#
- X############################################################################
- X#
- X# Links: none
- X#
- X# See also: retrieve.icn, makeind.icn
- X#
- X############################################################################
- X
- X
- Xprocedure binary_index_search(entry, index_filename)
- X
- X local in_index, bottom, top, loc, incr, firstpart, offset
- X
- X in_index := open(index_filename) |
- X abort("binary_index_search","can't open "||index_filename,18)
- X
- X bottom := 1
- X seek(in_index, 0)
- X top := where(in_index)
- X
- X # If bottom gets bigger than top, there's no such entry.
- X until bottom > top do {
- X
- X loc := (top+bottom) / 2
- X seek(in_index, loc)
- X
- X # Move past next newline. If at bottom, break.
- X incr := 1
- X until reads(in_index) == "\n" do
- X incr +:= 1
- X if loc+incr = bottom then {
- X top := loc-1
- X next
- X }
- X
- X # Check to see if the current line starts with entry (arg 1).
- X read(in_index) ? {
- X
- X # .IND file line format is entry\tbitmap-file-offset
- X if entry == (firstpart := tab(find("\t"))) then {
- X # return offset
- X return (move(1), tab(0))
- X }
- X # Ah, this is what all binary searches do.
- X else {
- X if entry << firstpart
- X then top := loc-1
- X else bottom := loc + incr + *&subject
- X }
- X }
- X }
- X
- Xend
- SHAR_EOF
- true || echo 'restore of binsrch.icn failed'
- rm -f _shar_wnt_.tmp
- fi
- # ============= bmp2text.icn ==============
- if test -f 'bmp2text.icn' -a X"$1" != X"-c"; then
- echo 'x - skipping bmp2text.icn (File already exists)'
- rm -f _shar_wnt_.tmp
- else
- > _shar_wnt_.tmp
- echo 'x - extracting bmp2text.icn (Text)'
- sed 's/^X//' << 'SHAR_EOF' > 'bmp2text.icn' &&
- X############################################################################
- X#
- X# Name: bmp2text.icn
- X#
- X# Title: convert a bitmap to a text-chunk
- X#
- X# Author: Richard L. Goerwitz
- X#
- X# Version: 2.6
- X#
- X############################################################################
- X#
- X# This file contains bitmap_2_text(bitmap, filename). Recall that
- X# bitmaps are just a series of fixed-length bitfields used to mark
- X# divisions within a text. The procedure retrieve() locates words in
- X# an index file, and returns a list of these bitmaps, which point to
- X# divisions within the original text file - divisions within which a
- X# given indexed word found by retrieve() occurs. The procedure
- X# bitmap_2_filename() simply takes a given bitmap and finds the text
- X# with which it is associated in the main text file.
- X#
- X# Note that bitmap_2_text() does not seek directly to the correct
- X# location within "filename" (arg 2). It first breaks down the
- X# bitmap into a less precise form via an offset table (read in via
- X# the .OFS file), looks up the precise location of the bitmap in the
- X# .UNT file, and then finally seeks up to that location in the main
- X# text file, decodes the text it finds at that location, and then
- X# returns the decoded section as a string. The reason
- X# bitmap_2_text() does this is that makeind (the indexing routine
- X# which creates data files for retrieve() and bitmap_2_text()) does
- X# not store the offset within the main text for every bitmap. It
- X# just saves the locations of major blocks in the .OFS file, then
- X# keeps a full list on disk in the .UNT file. This is basically just
- X# a space-saving device. It would eat up too much core memory to
- X# keep a list of every offset for every chunk of text marked out by a
- X# bitmap in filename.
- X#
- X# Note also that, although retrieve() returns a list of bitmaps, bit-
- X# map_2_text(bitmap, filename) expects a single bitmap as its first
- X# argument. It is better that text be retrieved as needed, one chunk
- X# at a time, and not stuffed en masse into core memory as soon as it
- X# is retrieve()'d.
- X#
- X############################################################################
- X#
- X# Links: ./indexutl.icn, ./initfile.icn
- X#
- X# See also: retrieve.icn, makeind.icn
- X#
- X############################################################################
- X
- X# Declared in indexutl.icn.
- X# record is(FS, s_len, len, no, is_case_sensitive, hufftree)
- X# global IS
- X
- X# Declared in initfile.icn.
- X# global filestats
- X# record Fs(ind_filename, bmp_filename, lim_filename, unt_filename,
- X# IS, ofs_table)
- X
- Xprocedure bitmap_2_text(bitmap, filename)
- X
- X local cut_down_bitmap, upto_field, offset, line, base_value_mask,
- X base_value, location
- X static t, old_main_filename, in_main_file, in_unt_file
- X # global filestats, IS
- X initial {
- X t := table()
- X old_main_filename := ""
- X }
- X
- X # Check for sloppy programming.
- X /filename & abort("bitmap_2_text","you called me without a filename",29)
- X
- X if old_main_filename ~==:= filename then {
- X #
- X # If necessary, initialize stats for the current file.
- X #
- X if /filestats | /filestats[filename]
- X then initfile(filename) # see initfile.icn
- X # open full text file for reading
- X every close(\in_main_file | \in_unt_file)
- X in_main_file := open(filename) |
- X abort("bitmap_2_text", "can't open "||filename, 26)
- X in_unt_file := open(filestats[filename].unt_filename) |
- X abort("bitmap_2_text", "can't open .UNT file for "||filename, 27)
- X }
- X
- X # Reset IS to current file.
- X IS := filestats[filename].IS
- X
- X # Determine offset to seek to by using the bitmap->offset table
- X # for the current file (arg 2). The name of the bitmap_offset
- X # table is stored in filestats[filename].ofs_table.
- X #
- X upto_field := 1 < (filestats[filename].IS.no * 2) / 3 | 1
- X cut_down_bitmap := ishift(bitmap, -(IS.no - upto_field) * IS.len)
- X offset := \filestats[filename].ofs_table[cut_down_bitmap] | fail
- X
- X # Seek to offset, and begin looking for the string equiv. of
- X # bitmap (arg 1).
- X #
- X seek(in_unt_file, offset) |
- X abort("bitmap_2_text","can't seek to offset "||offset, 27)
- X
- X #
- X # Find the major text division for bitmap using the offset table
- X # (in filestats[filename].ofs_table), look up its precise loca-
- X # tion in the .UNT file, then seek to that location in the main
- X # text file and decode whatever text is encoded at that location.
- X #
- X
- X #
- X # First figure out how to tell if we've gone too far. Basically,
- X # mask out the lower bits, and record the value of the upper bits.
- X # Some fooling around is necessary because bitmaps may use large
- X # ints, making it impossible to use icom() in a naive manner.
- X # If the upper bits of the bitmaps being read change, then we've
- X # gone too far.
- X #
- X base_value_mask := icom(2^((IS.no - upto_field) * IS.len)- 1)
- X base_value := iand(bitmap, base_value_mask)
- X
- X while line := read(in_unt_file) do {
- X line ? {
- X location := digits_2_bitmap(tab(find("\t"))) # in indexutl.icn
- X if bitmap = location then {
- X move(1) # move past tab character
- X # block_decode() is in huffcode.icn; decodes the encoded
- X # verse and returns the result (should be an ASCII string)
- X seek(in_main_file, offset := integer(tab(0))) |
- X abort("bitmap_2_text","unable to seek to "||offset,28)
- X return block_decode(in_main_file, IS.hufftree)
- X }
- X else {
- X if base_value ~= iand(location, base_value_mask)
- X then fail
- X }
- X }
- X }
- X
- X # we should have returned by now
- X fail
- X
- Xend
- SHAR_EOF
- true || echo 'restore of bmp2text.icn failed'
- rm -f _shar_wnt_.tmp
- fi
- # ============= initfile.icn ==============
- if test -f 'initfile.icn' -a X"$1" != X"-c"; then
- echo 'x - skipping initfile.icn (File already exists)'
- rm -f _shar_wnt_.tmp
- else
- > _shar_wnt_.tmp
- echo 'x - extracting initfile.icn (Text)'
- sed 's/^X//' << 'SHAR_EOF' > 'initfile.icn' &&
- X############################################################################
- X#
- X# Name: initfile.icn
- X#
- X# Title: initialize entry for file in filestats table
- X#
- X# Author: Richard L. Goerwitz
- X#
- X# Version: 2.2
- X#
- X############################################################################
- X#
- X# This file contains initfile(filename), which creates a set of stats
- X# for the indexed database contained in filename. Uses several global
- X# structures, primarily for speed. Beware.
- X#
- X############################################################################
- X#
- X# See also: retrieve.icn, bmp2text.icn, retrops.icn
- X#
- X############################################################################
- X
- X# Used to store stats for each filename.
- Xrecord Fs(ind_filename, bmp_filename, lim_filename, unt_filename,
- X IS, ofs_table)
- X
- X# IS is declared in indexutl.icn.
- X# global IS
- X
- Xglobal filestats
- X
- Xprocedure initfile(filename)
- X
- X # Messy procedure which creates and stores the names of several
- X # files that will be repeatedly used with "filename." Reads in
- X # the stats for filename from that file's .IS file. Also reads in
- X # the bitmap->offset (.OFS file) table, and puts it into
- X # filestats[filename].ofs_table for later (re-)use. The bitmap->
- X # offset table contains pointers into the .UNT file for filename,
- X # which lists all the main text divisions, with pointers into the
- X # main text file (i.e. filename) for each division. The scheme
- X # is: .OFS file (locates larger divisions) -> .UNT file (contains
- X # the offsets for smaller divisions in filename) -> filename (the
- X # actual compressed text).
- X
- X local IS_filename, in_IS, upto_field, stored_bitmap_length,
- X ofs_filename, intext, cut_down_bitmap, block_size, offset
- X # global filestats
- X initial {
- X filestats := table()
- X # OS-specific parameters are initialized here.
- X initialize_os_params() # in indexutl.icn
- X }
- X
- X # Check for sloppy programming. Did we do this one already??
- X if not (/filestats[filename] := Fs(,,,,,table())) then fail
- X
- X filestats[filename].ind_filename :=
- X dir_name(filename)||create_fname(filename, "IND")
- X filestats[filename].bmp_filename :=
- X dir_name(filename)||create_fname(filename, "BMP")
- X filestats[filename].lim_filename :=
- X dir_name(filename)||create_fname(filename, "LIM")
- X filestats[filename].unt_filename :=
- X dir_name(filename)||create_fname(filename, "UNT")
- X
- X # Decode stored IS record for filename.
- X IS_filename := dir_name(filename)||create_fname(filename, "IS")
- X in_IS := open(IS_filename) | abort("bitmap_2_text",
- X "Can't open "||IS_filename||". Did you forget to index?", 24)
- X filestats[filename].IS := decode(!in_IS)
- X close(in_IS)
- X
- X # Having decoded IS, we can now determine the length of the cut-
- X # down bitmaps stored in the .OFS file for filename.
- X upto_field := 1 < (filestats[filename].IS.no * 2) / 3 | 1
- X stored_bitmap_length :=
- X ((filestats[filename].IS.len * upto_field) <= seq(0,8))
- X
- X # open .OFS file
- X ofs_filename := dir_name(filename)||create_fname(filename, "OFS")
- X intext := open(ofs_filename) |
- X abort("bitmap_2_text", "can't open "||ofs_filename, 23)
- X
- X # read in blocks from .OFS file, breaking them into their
- X # constituent parts
- X while block_size := read_int(intext, 8) * 8 do {
- X cut_down_bitmap := read_int(intext, stored_bitmap_length)
- X offset := read_int(intext, block_size - stored_bitmap_length)
- X insert(filestats[filename].ofs_table, cut_down_bitmap, offset)
- X }
- X close(intext)
- X
- X return *filestats[filename].ofs_table
- X
- Xend
- SHAR_EOF
- true || echo 'restore of initfile.icn failed'
- rm -f _shar_wnt_.tmp
- fi
- # ============= retrieve.icn ==============
- if test -f 'retrieve.icn' -a X"$1" != X"-c"; then
- echo 'x - skipping retrieve.icn (File already exists)'
- rm -f _shar_wnt_.tmp
- else
- > _shar_wnt_.tmp
- echo 'x - extracting retrieve.icn (Text)'
- sed 's/^X//' << 'SHAR_EOF' > 'retrieve.icn' &&
- X############################################################################
- X#
- X# Name: retrieve.icn
- X#
- X# Title: retrieve locations of words in database file
- X#
- X# Author: Richard L. Goerwitz
- X#
- X# Version: 1.25
- X#
- X############################################################################
- X#
- X# Retrieve(pattern, filename) retrieves all locations containing
- X# words matching pattern (arg1) in filename (arg2), placing them in a
- X# list. "Locations" are integer-coded pointers to places in filename
- X# where corresponding text is located. To actually retrieve that
- X# block of text, you must call bitmap_2_text(location, filename).
- X# Retrieve() only gathers up a list of locations in filename
- X# containing words which match pattern.
- X#
- X# The reason retrieve() doesn't do the logical thing - namely, to
- X# "retrieve" text itself - is that doing so might use a *lot* of
- X# memory. It is far more economical to retrieve text only when a
- X# given chunk is requested via bitmap_2_text().
- X#
- X# Note: Patterns must match words in their entirety. For instance,
- X# retrieve("dog",filename) would only retrieve exact matches for the
- X# word "dog" in filename. To catch, say, "doggie" as well, it would
- X# be necessary to call retrieve with a regular expression that
- X# matched both dog and doggie (e.g. retrieve("dog.*",filename)).
- X#
- X############################################################################
- X#
- X# Links: codeobj.icn, ./indexutl.icn, ./binsrch.icn, ./initfile.icn
- X# ./findre.icn shquote.icn
- X#
- X# See also: makeind.icn, bmp2text.icn
- X#
- X############################################################################
- X
- Xlink codeobj, shquote
- X
- X# The following globals contain stats for current file (here, arg2).
- X# global filestats # declared in initfile.icn
- X# global IS # declared in indexutl.icn
- X
- Xprocedure retrieve(pattern, filename, inverse)
- X
- X local bitmap_set, bmp_file, in_egrep, intext, cmd,
- X offset, pattern2, line
- X static is_UNIX, egrep_filename
- X initial {
- X if is_UNIX := find("UNIX",&features) then
- X # If egrep is available, use it. It's fast.
- X egrep_filename := "egrep"
- X # egrep_filename := "/usr/local/bin/gnuegrep"
- X }
- X
- X # Check for sloppy programming.
- X /filename & abort("retrieve","you called me without a filename",22)
- X
- X # Initialize important variables.
- X #
- X if /filestats | /filestats[filename]
- X then initfile(filename) # see initfile.icn
- X bitmap_set := set() # list will contain locations of hits
- X IS := filestats[filename].IS # re-initialize IS for current file
- X if /IS.is_case_sensitive then
- X pattern := map(pattern)
- X
- X # Open bitmap file.
- X #
- X bmp_file := open(filestats[filename].bmp_filename) |
- X abort("retrieve","can't open "||filestats[filename].bmp_filename, 29)
- X
- X # Search index.
- X #
- X if are_metas(pattern) then {
- X # NB: are_metas() can be found in indexutl.icn
- X
- X # If there are metacharacters in pattern, do a regexp pattern match.
- X # The .IND file goes: line ::= key \t other-stuff.
- X pattern := "^(" || pattern || ")\t"
- X
- X # If UNIX, then use egrep to search index.
- X #
- X if \is_UNIX then {
- X
- X # Set up command line to be passed to /bin/sh. First make
- X # sure we don't have any apostrophe's hanging around to
- X # screw up the command line to be passed to /bin/sh, then
- X # put together a command line to be passed to egrep.
- X pattern2 := shquote(pattern) # from the IPL
- X
- X cmd := egrep_filename || " " || pattern2 ||
- X " " || filestats[filename].ind_filename ||
- X " 2>&1"
- X # open pipe
- X in_egrep := open(cmd, "rp") |
- X abort("retrieve","can't open pipe from\n\t"||cmd, 20)
- X # grep .IND index file
- X every line := !in_egrep do {
- X # Kludge, but it's the only way to tell if there's an error.
- X find("error"|"grep", line) & {
- X # Define some routine here that issues a warning; there
- X # is no need to actually abort!
- X (\err_message)("Regular expression syntax error.") |
- X stop("retrieve: regexp syntax error")
- X break
- X }
- X line ? (tab(find("\t")+1), offset := integer(tab(0)))
- X bitmap_set ++:=
- X retrieve_bitmaps(offset, bmp_file)
- X }
- X close(in_egrep)
- X
- X # ...otherwise (i.e. if not UNIX) use findre() instead of egrep
- X #
- X } else {
- X
- X # Probably MS-DOS or something else. SLOW, SLOW!
- X intext := open(filestats[filename].ind_filename) |
- X abort("retrieve","can't open index file", 21)
- X # grep .IND file
- X every line := !intext do {
- X line ? {
- X if findre(pattern) then {
- X offset := integer(tab(0))
- X bitmap_set ++:=
- X retrieve_bitmaps(offset, bmp_file)
- X }
- X }
- X }
- X every close(bmp_file | intext)
- X }
- X
- X # If *not* are_metas(pattern), then do a binary search of the index.
- X # No need to worry about is_UNIX, egrep, findre(), etc.
- X #
- X } else {
- X if offset :=
- X binary_index_search(pattern, filestats[filename].ind_filename)
- X then bitmap_set ++:=
- X retrieve_bitmaps(offset, bmp_file)
- X }
- X
- X # If inverse (arg 3) is nonnull, then invert the sense of the search.
- X # Do this by knocking out those parts of the full bitmap set that are
- X # in the bitmap_set, and then assigning the result to bitmap_set.
- X #
- X if \inverse then
- X bitmap_set := (all_bitmaps(bmp_file) -- bitmap_set)
- X
- X # We're done. See if there were any hits. If so, sort & return a
- X # list (lists are easier for the display routines to handle).
- X #
- X close(bmp_file)
- X #
- X if *bitmap_set > 0
- X then return sort(bitmap_set)
- X else fail
- X
- Xend
- X
- X
- X
- Xprocedure retrieve_bitmaps(offset, f, return_a_list)
- X
- X local bitmap_list, bitmap_length, i, tmp, how_many_bitmaps,
- X bits_needed, inverse_signal
- X
- X bits_needed := 24
- X inverse_signal := 8388608
- X
- X seek(f, offset)
- X bitmap_length := ((IS.len * IS.no) <= seq(0,8))
- X tmp := read_int(f, bits_needed)
- X how_many_bitmaps := iand(inverse_signal-1, tmp)
- X
- X # Slower way.
- X # bitmap_list := list(how_many_bitmaps)
- X # every i := 1 to how_many_bitmaps do
- X # bitmap_list[i] := read_int(f, bitmap_length)
- X
- X # Slow way.
- X bitmap_list := list()
- X every i := 1 to how_many_bitmaps do
- X put(bitmap_list, read_int(f, bitmap_length))
- X
- X # If the inverse signal bit is turned on, then the BMP file stores
- X # non-occurrences for a given key (rather than occurrences). Saves
- X # space for a/the/and, etc., but necessitates collecting all bitmaps
- X # for the current file into a set a set difference. The procedure
- X # all_bitmaps does the collecting.
- X if iand(inverse_signal, tmp) ~= 0 then {
- X bitmap_list := (all_bitmaps(f) -- set(bitmap_list))
- X }
- X
- X if \return_a_list
- X then return sort(bitmap_list)
- X else {
- X if type(bitmap_list) == "list"
- X then return set(bitmap_list)
- X else return bitmap_list
- X }
- X
- Xend
- X
- X
- X
- Xprocedure all_bitmaps(f, return_a_list)
- X
- X # At offset 1 in the BMP file is the list of all bitmaps in the
- X # full file. Returns the set of these, unless a list is desired,
- X # in which case one must call all_bitmaps() with a nonnull second
- X # argument.
- X return retrieve_bitmaps(1, f, return_a_list)
- X
- Xend
- SHAR_EOF
- true || echo 'restore of retrieve.icn failed'
- rm -f _shar_wnt_.tmp
- fi
- # ============= indexutl.icn ==============
- if test -f 'indexutl.icn' -a X"$1" != X"-c"; then
- echo 'x - skipping indexutl.icn (File already exists)'
- rm -f _shar_wnt_.tmp
- else
- > _shar_wnt_.tmp
- echo 'x - extracting indexutl.icn (Text)'
- sed 's/^X//' << 'SHAR_EOF' > 'indexutl.icn' &&
- X############################################################################
- X#
- X# Name: indexutl.icn
- X#
- X# Title: indexing utilities
- X#
- X# Author: Richard L. Goerwitz
- X#
- X# Version: 2.1
- X#
- X############################################################################
- X#
- X# This file contains base_name(), dir_name(), get_index_fname(),
- X# stripchars(), abort(), and gettokens().
- X#
- X# base_name(s), dir_name(s) - like the Unix system commands
- X# create_fname(fname,ext) - get a new filename based on fname + ext
- X# stripchars(s,c) - strip chars c from string s
- X# abort(proc,msg,ecode) - abort procedure proc with exit code ecode
- X# write_int(f, int, size) - breaks int into 8-bit chunks & writes to f
- X# read_int(f, int, size) - like write_int, only constructs int from f
- X# are_metas(pattern) - succeeds if pattern has egrep-style metas
- X# digits_2_bitmap(s) - converts string 01:13:94 to an int-bitmap
- X#
- X############################################################################
- X#
- X# Links: ./findre.icn, radcon.icn, bincvt.icn
- X#
- X# See also: retrieve.icn, retrops.icn, bmp2text.icn, makeind.icn
- X#
- X############################################################################
- X
- X#
- X# All from the IPL.
- X#
- Xlink radcon, bincvt
- X
- X#
- X# FS = field separator, s_len = string length of fields, len =
- X# byte length of fields, no = number of fields, is_case_sensitive =
- X# whether to map index entries to lowercase, r_field = rollover
- X# field for limits file.
- X#
- Xrecord is(FS, s_len, len, no, is_case_sensitive, r_field, hufftree)
- Xglobal _slash, _baselen, IS
- X
- X
- Xprocedure base_name(s)
- X
- X # If s == "/usr/local/man/man1/icon.1", base_name will return
- X # "icon.1". Somewhat like the Unix basename system command.
- X
- X # global _slash # _slash = \ for MS-DOS, / for Unix
- X s ? {
- X while tab(find(_slash)+1)
- X return tab(0)
- X }
- X
- Xend
- X
- X
- X
- Xprocedure dir_name(s)
- X
- X # If s == "/usr/local/man/man1/icon.1", dir_name will return
- X # "/usr/local/man/man1". Somewhat like the Unix dirname system
- X # command.
- X
- X local s2
- X # global _slash # _slash = \ for MS-DOS, / for Unix
- X
- X s2 := ""
- X s ? {
- X while s2 ||:= tab(find(_slash)+1)
- X return s2
- X }
- X
- Xend
- X
- X
- X
- Xprocedure create_fname(FNAME, EXT)
- X
- X #
- X # Discard path component. Cut basename down to a small enough
- X # size that the OS will be able to handle addition of the ex-
- X # tension, EXT.
- X #
- X
- X # global _slash, _baselen
- X
- X *EXT > 3 &
- X abort("get_index_fname","extension too long",7)
- X
- X return right(
- X stripchars(base_name(FNAME,_slash),'.'), _baselen, "x") ||
- X "." || EXT
- X
- Xend
- X
- X
- X
- Xprocedure stripchars(s,c)
- X
- X # Strip chars (c) from string (s). Return stripped s.
- X
- X local s2
- X
- X s2 := ""
- X s ? {
- X while s2 ||:= tab(upto(c))
- X do tab(many(c))
- X s2 ||:= tab(0)
- X }
- X return s2
- X
- Xend
- X
- X
- X
- Xprocedure abort(proc_name, message, error_code)
- X
- X if not (/proc_name := "") then
- X proc_name := trim(proc_name, ': ') || ": "
- X /error_code := 1
- X
- X write(&errout, proc_name, \message) # fail if there's no error msg,
- X exit(error_code) # then abort
- X
- Xend
- X
- X
- X
- Xprocedure write_int(f, i, size)
- X
- X # Write out an integer byte-by-byte.
- X #
- X # Important little routine. I know it looks inelegant and slow.
- X # Feel free to modify it for speed, and send me the results.
- X # Don't knock out the old code, though. You understood it when
- X # you read it, right? That's the idea :-).
- X
- X local marker, how_many
- X
- X marker := ""
- X how_many := 0
- X /size := (*exbase10(i,2) <= seq(0,8))
- X
- X # output bytes most significant first; then least significant
- X until (size -:= 8) <= -8 do {
- X how_many +:= 1
- X marker ||:= (f, char(iand(ishift(i, -size), 2r11111111)))
- X }
- X
- X writes(f, marker)
- X return how_many # number of characters written
- X
- Xend
- X
- X
- X
- Xprocedure read_int(f, size)
- X
- X local i, _shift
- X
- X # collect bytes, putting the first one read into the high
- X # end of an integer, and on down to the last read (into the
- X # low end)
- X i := _shift := 0
- X while (_shift +:= 8) <= size do
- X i +:= ishift(ord(reads(f)), size - _shift) | fail
- X return i
- X
- Xend
- X
- X
- X
- Xprocedure initialize_os_params()
- X
- X local os
- X # global _slash, _baselen
- X
- X if find("MS-DOS", os := &features) then {
- X _slash := "\\"; _baselen := 8
- X }
- X else if find("UNIX", os := &features) then {
- X _slash := "/"; _baselen := 10
- X }
- X else abort("initialize_os_params","os parameters undefined", 6)
- X
- X return os
- X
- Xend
- X
- X
- Xprocedure are_metas(str)
- X
- X local chr, tmp
- X
- X str ? {
- X
- X # String-initial metacharacters are meaningless.
- X tab(many('*+?|'))
- X
- X # Look for metacharacters and backslashes.
- X while tab(upto('\\*+()|?.$^[')) do {
- X
- X # If a backslash comes first, then the next character can't
- X # be a meta. Move past it, and try again.
- X if ="\\" then move(1) |
- X abort("are_metas","malformed \-escape sequence",19)
- X # Otherwise, we have a metacharacter. Return its position
- X # in str. Dereference just so as not to have a global var.
- X # on the loose.
- X else return .&pos
- X }
- X
- X }
- X
- X # If we've gotten this far without returning, then the string is
- X # clean of metacharacters, and (in boolean terms) the procedure
- X # are_metas() returns false.
- X fail
- X
- Xend
- X
- X
- X#
- X# digits_2_bitmap
- X#
- X# Converts a string representation of a set of bit-fields into an
- X# integer. I.e. 1:1:3 becomes binary 010111 (decimal 23). This
- X# integer is like a map, and is called, in text-processing circles,
- X# a bitmap (not to be confused with bit-mapped display techniques).
- X#
- Xprocedure digits_2_bitmap(s)
- X
- X # s = location string (e.g. 10:02:03:75)
- X # IS.s_len = the string length of fields in s (3 in the above example)
- X # IS.len = the number of bits needed to hold an integer
- X # representation of a single field
- X # IS.no = number of fields in s (4 in the above example)
- X #
- X # Fixed field lengths make things much simpler, but a whole
- X # helluva lot less economical. Be sure that (IS.len * IS.no) does
- X # not exceed the register width for your CPU if either a) your
- X # implementation has no limits on the size of integers, or b) you
- X # are really concerned about performance. Otherwise, never mind.
- X
- X local bitmap, field, no
- X
- X no := IS.no
- X bitmap := 0
- X
- X s ? {
- X if upto(~&digits) then {
- X # The bitmap is delineated by field-markers (e.g. 11;23).
- X tab(upto(&digits))
- X while field := tab(many(&digits)) do {
- X no -:= 1
- X tab(upto(&digits))
- X bitmap +:= ishift(field, no * IS.len)
- X }
- X } else {
- X # Yuck! An un-delineated bitmap (e.g. 23423).
- X while field := integer(move(IS.s_len)) do {
- SHAR_EOF
- true || echo 'restore of indexutl.icn failed'
- fi
- echo 'End of part 4'
- echo 'File indexutl.icn is continued in part 5'
- echo 5 > _shar_seq_.tmp
- exit 0
-
- exit 0 # Just in case...
- --
- Kent Landfield INTERNET: kent@sparky.IMD.Sterling.COM
- Sterling Software, IMD UUCP: uunet!sparky!kent
- Phone: (402) 291-8300 FAX: (402) 291-4362
- Please send comp.sources.misc-related mail to kent@uunet.uu.net.
-