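/* Archive-viewer residue (not part of the original source file):
   navigation: home | CD-ROM | disk | FTP | other | search
   path: Fresh Fish 8 / FreshFishVol8-CD2.bin / bbs / gnu /
         ispell-4.0-src.lha / ispell-4.0 / build.c
   viewer controls: < prev | next > | Wrap
   file info: C/C++ Source or Header | 1993-06-01 | 11KB | 571 lines  */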
/* Copyright (C) 1990, 1993 Free Software Foundation, Inc.
This file is part of GNU ISPELL.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
#include <stdio.h>
#include <ctype.h>
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#include "ispell.h"
#include "hash.h"
#include "build.h"
#include "getopt.h"
/* Non-zero when -p was given; build() then traces each word's slot
   assignment on stdout. */
int print = 0;
/* Fallback definition, used only if no min macro is already defined. */
#ifndef min
#define min(a,b) ((a)<(b)?(a):(b))
#endif
/* Bit vectors allocated by build().
   tmp_hused: BITVECSIZE bytes; one bit is set for each raw hash() value
   seen during the counting pass.
   hused: hused_limit bytes; bits are set for slot 0, for the padding
   bits past hashsize, and for every raw hash value reduced modulo
   hashsize. */
unsigned char *tmp_hused, *hused;
#define BITVECSIZE 8192 /* 64K bits */
/* Size of hused in bytes; build() sets it to (hashsize + 7) >> 3. */
unsigned short hused_limit;
/* Line buffer that build() fills with fgets(). */
char inbuf[100];
/* Receives the output of lexword() for the word currently in inbuf. */
comp_char outbuf[100];
/* In the part of the file visible here this object is only used as an
   operand of sizeof, to obtain the sizes of an entry's data fields. */
struct hash_table_entry hte;
/* Forward declarations for routines called from this file.  ANSI
   compilers get full prototypes; pre-ANSI compilers get plain
   declarations so the return types are known before the first call.
   Both branches declare the same set of functions: without the
   mklextbl and set_bitvec entries a pre-ANSI compiler would implicitly
   declare them as returning int, contradicting the void return type
   given in the ANSI branch. */
#ifdef __STDC__
unsigned short parse_flags (comp_char *, char *);
hash_index find_slot (hash_index);
void toomanywords (void);
void mklextbl (char *, FILE *);
void hash_write (FILE *);
void hash_awrite (FILE *);
void hash_ewrite (FILE *);
int lexword (char *, int, unsigned char *);
void set_bitvec (unsigned char *, unsigned short);
#else
extern unsigned short parse_flags ();
extern hash_index find_slot ();
extern void toomanywords ();
extern void mklextbl ();
extern void hash_write ();
extern void hash_awrite ();
extern void hash_ewrite ();
extern int lexword ();
extern void set_bitvec ();
#endif
/* Command-line switches, set by main():
   binary_flag (-b): write the table with write_binary().
   ascii_flag  (-a): write the table with write_ascii().
   reap_flag   (-r): call reapall() after build().
   debug_flag  (-d): call mklextbl() with stdout and exit. */
int binary_flag = 0;
int ascii_flag = 0;
int reap_flag = 0;
int debug_flag = 0;
/* Output file name: copied from the -o argument, otherwise main()
   derives it from the input name plus ".hsh" or ".f". */
char buildout[100];
/* this array contains letters to use when generating near misses */
char near_miss_letters[256];
/* Count associated with near_miss_letters -- presumably the number of
   entries in use; not referenced in the visible part of the file. */
int nnear_miss_letters;
/* this array has 1 for any character that is in near_miss_letters */
char near_map[256];
/* Store NAME followed by SUFFIX in buildout as the default output file
   name.  Exits with a diagnostic instead of overflowing buildout when
   the combined name does not fit. */
static void
default_outname (name, suffix)
     char *name;
     char *suffix;
{
  if (strlen (name) + strlen (suffix) >= sizeof buildout)
    {
      (void) fprintf (stderr, "output name too long: %s%s\n", name, suffix);
      exit (1);
    }
  (void) strcpy (buildout, name);
  (void) strcat (buildout, suffix);
}

/* Entry point of the dictionary builder.

   Options:
     -f FILE  frequency file handed to mklextbl() (required)
     -b       write the table with write_binary()
     -a       write the table with write_ascii()
              (at least one of -a / -b is required)
     -o FILE  output file name; without it the name is the word-list
              name plus ".hsh" (binary) or ".f" (ascii)
     -r       call reapall() after build()
     -p       make build() trace slot assignments on stdout
     -d       call mklextbl() with stdout, then exit

   The first non-option argument is the word list to read.
   Returns 0 on success; every failure path calls exit (1). */
int
main (argc, argv)
     int argc;
     char **argv;
{
  FILE *in;
#ifdef __STDC__
  void reapall (void);
#else
  extern void reapall ();
#endif
  int c;
  extern char *optarg;
  extern int optind;
  char *name;
  char *freqfile = NULL;
  int have_outname = 0;		/* non-zero once -o has supplied a name */

  buildout[0] = 0;
  while ((c = getopt (argc, argv, "prbao:f:d")) != EOF)
    {
      switch (c)
	{
	case 'd':
	  debug_flag = 1;
	  break;
	case 'f':
	  freqfile = optarg;
	  break;
	case 'p':
	  print = 1;
	  break;
	case 'r':
	  reap_flag = 1;
	  break;
	case 'b':
	  binary_flag = 1;
	  break;
	case 'a':
	  ascii_flag = 1;
	  break;
	case 'o':
	  /* buildout is a fixed-size array; refuse names that do not fit. */
	  if (strlen (optarg) >= sizeof buildout)
	    {
	      (void) fprintf (stderr, "output name too long: %s\n", optarg);
	      exit (1);
	    }
	  (void) strcpy (buildout, optarg);
	  have_outname = 1;
	  break;
	default:
	  /* message printed by getopt */
	  exit (1);
	}
    }
  if (freqfile == NULL)
    {
      (void) fprintf (stderr, "no freq file specified\n");
      exit (1);
    }
  if (binary_flag == 0 && ascii_flag == 0)
    {
      (void) fprintf (stderr, "must have either -a or -b\n");
      exit (1);
    }
  if (debug_flag)
    {
      /* Debug mode needs only the frequency file. */
      mklextbl (freqfile, stdout);
      exit (0);
    }
  /* Everything below reads the word list, so it must have been given;
     otherwise argv[optind] is a null pointer. */
  if (optind >= argc)
    {
      (void) fprintf (stderr, "no word list specified\n");
      exit (1);
    }
  name = argv[optind];
  mklextbl (freqfile, (FILE *) NULL);
  if ((in = fopen (name, "r")) == NULL)
    {
      (void) fprintf (stderr, "can't open %s\n", name);
      exit (1);
    }
  build (in);
  (void) fclose (in);
  if (reap_flag)
    reapall ();
  /* Without -o each format gets its own default name, so asking for
     both -a and -b no longer makes the ascii dump overwrite the binary
     file.  With -o the name given is used as is for whichever formats
     were requested. */
  if (binary_flag)
    {
      if (!have_outname)
	default_outname (name, ".hsh");
      write_binary (buildout);
    }
  if (ascii_flag)
    {
      if (!have_outname)
	default_outname (name, ".f");
      write_ascii (buildout);
    }
  return (0);
}
/* Create the file NAME and hand the open stream to hash_write().
   Under MSDOS the file is opened in binary mode ("wb"), elsewhere with
   plain "w".  Exits with status 1 if the file cannot be created or if
   a write error is detected, including one reported only when the
   stream is flushed by fclose(). */
void
write_binary (name)
     char *name;
{
  FILE *out;
  char *mode;
  int failed;

#ifdef MSDOS
  mode = "wb";
#else
  mode = "w";
#endif
  if ((out = fopen (name, mode)) == NULL)
    {
      (void) fprintf (stderr, "can't create %s\n", name);
      exit (1);
    }
  hash_write (out);
  /* Check the error indicator first, then close: buffered data is only
     written out by fclose(), so its result has to be checked as well. */
  failed = ferror (out);
  if (fclose (out) == EOF)
    failed = 1;
  if (failed)
    {
      (void) fprintf (stderr, "error writing %s\n", name);
      exit (1);
    }
}
/* Create the file NAME and hand the open stream to hash_awrite().
   Exits with status 1 if the file cannot be created or if a write
   error is detected, including one reported only when the stream is
   flushed by fclose(). */
void
write_ascii (name)
     char *name;
{
  FILE *out;
  int failed;

  if ((out = fopen (name, "w")) == NULL)
    {
      (void) fprintf (stderr, "can't create %s\n", name);
      exit (1);
    }
  hash_awrite (out);
  /* Check the error indicator first, then close: buffered data is only
     written out by fclose(), so its result has to be checked as well. */
  failed = ferror (out);
  if (fclose (out) == EOF)
    failed = 1;
  if (failed)
    {
      (void) fprintf (stderr, "error writing %s\n", name);
      exit (1);
    }
}
/* Build the hash table from the word list IN, reading it twice.

   Pass 1 counts the usable words, adds up the string space needed for
   words that do not fit in an entry, and records every raw hash value
   in the tmp_hused bit vector.  The table size is then chosen with
   nextprime(), the raw hash values are folded modulo that size into
   the hused bit vector, and the tables are allocated.

   Pass 2 rewinds IN and stores each word: a word whose home slot is
   already occupied is appended to the end of that slot's chain, in a
   slot chosen by find_slot().

   Lines beginning with '#' are comments and are skipped in both
   passes; text after a '/' on a line is passed to parse_flags() in the
   second pass.  Progress is reported on stderr; with the global
   `print' set, a per-word trace goes to stdout.  Calls toomanywords()
   (which exits) if the table limit is exceeded. */
void
build (in)
     FILE *in;
{
  int len;
  struct hash_table_entry *htep;
  unsigned short nwords = 0;
  int i;
  hash_index h, prev_hindex;
  char *p;
  unsigned short flags;
  unsigned short strsize = 0;
#ifdef __STDC__
  unsigned short nextprime (unsigned short);
#else
  extern unsigned short nextprime ();
#endif

  /* ---- Pass 1: count words and note which hash values occur. ---- */
  tmp_hused = (unsigned char *) xcalloc (1, BITVECSIZE);
  (void) fprintf (stderr, "Counting words: ");
  while (fgets (inbuf, (int) sizeof inbuf, in) != NULL)
    {
      if (inbuf[0] == '#')
	continue;
      /* Only the word itself matters here: cut off the flags and the
         newline. */
      p = (char *) strchr (inbuf, '/');
      if (p)
	*p = 0;
      p = (char *) strchr (inbuf, '\n');
      if (p)
	*p = 0;
      if (inbuf[0] == 0)
	continue;
      if (strlen (inbuf) >= MAX_WORD_LEN)
	{
	  (void) fprintf (stderr, "warning: %s too long\n", inbuf);
	  continue;
	}
      len = lexword (inbuf, strlen (inbuf), outbuf);
      if (len == 0)
	{
	  (void) fprintf (stderr,
			  "%s can't be compressed; try re-running 'freq'\n",
			  inbuf);
	  continue;
	}
      if (nwords >= HASH_SPECIAL - 1)
	toomanywords ();
      if ((nwords % 1000) == 0)
	{
	  (void) fprintf (stderr, "%u ", nwords);
	  (void) fflush (stderr);
	}
      nwords++;
      /* A word longer than the short data field needs extra string
         space; strsize is later passed to alloc_tables().
         NOTE(review): the test uses u.s.data but the subtraction uses
         u.l.data -- presumably the long form keeps a prefix inline;
         confirm against hash.h. */
      if (len > sizeof hte.u.s.data)
	strsize += len - sizeof hte.u.l.data;
      h = hash ((char *) outbuf);
      set_bitvec (tmp_hused, h);
    }
  nwords++;			/* add one since slot 0 is reserved */
  (void) fprintf (stderr, "%u words\n", nwords);

  /* ---- Size the table and compute the final slot-usage map. ---- */
  hashsize = nextprime (nwords);
  if (hashsize >= HASH_SPECIAL)
    toomanywords ();
  hused_limit = (hashsize + 7) >> 3;
  hused = (unsigned char *) xcalloc (1, hused_limit);
  /* make the slots that were introduced when rounding up
   * by 8 be busy
   */
  for (h = hashsize; h < hused_limit * 8; h++)
    set_bitvec (hused, h);
  /* Walk every bit of tmp_hused; h is the raw hash value that the
     current bit stands for. */
  for (i = 0, h = 0; i < BITVECSIZE; i++)
    {
      unsigned char x = tmp_hused[i];
      unsigned short bit;
      for (bit = 1; bit != 0x100; bit <<= 1, h++)
	{
	  if (x & bit)
	    {
	      hash_index real_hash = h % hashsize;
	      set_bitvec (hused, real_hash);
	    }
	}
    }
  /* now hused has 1 where ever something will hash to */
  free ((char *) tmp_hused);
  tmp_hused = NULL;
  alloc_tables ((long) hashsize, (long) strsize);

  /* Slot 0 is reserved: mark it busy and store an empty entry whose
     chain link is HASH_END. */
  set_bitvec (hused, 0);
  htep = (struct hash_table_entry *) xcalloc (1, sizeof *htep);
  htep->next = HASH_END;
  hash_store (0, htep);
  free ((char *) htep);

  /* ---- Pass 2: store every word. ---- */
  rewind (in);
  (void) fprintf (stderr, "Starting hash: ");
  nwords = 0;
  while (fgets (inbuf, (int) sizeof inbuf, in) != NULL)
    {
      /* Here nwords counts lines read and only drives the progress
         display. */
      if ((nwords % 1000) == 0)
	{
	  (void) fprintf (stderr, "%u ", nwords);
	  (void) fflush (stderr);
	}
      nwords++;
      /* Comment lines were not counted or hashed in the first pass, so
         they must not be stored here either. */
      if (inbuf[0] == '#')
	continue;
      flags = 0;
      p = (char *) strchr (inbuf, '/');
      if (p)
	{
	  flags = parse_flags (inbuf, p);
	  *p = 0;
	}
      p = (char *) strchr (inbuf, '\n');
      if (p)
	*p = 0;
      /* Same filters as the first pass, minus the diagnostics that
         were already printed there. */
      if (inbuf[0] == 0)
	continue;
      if (strlen (inbuf) >= MAX_WORD_LEN)
	continue;
      len = lexword (inbuf, strlen (inbuf), outbuf);
      if (len == 0)
	continue;
      h = hash ((char *) outbuf) % hashsize;
      if (print)
	(void) printf ("%s hash %u; ", inbuf, h);
      if (!hash_emptyp (h))
	{
	  /* Home slot taken: link a new slot onto the end of its
	     chain. */
	  if (print)
	    (void) printf ("already used ");
	  prev_hindex = hash_find_end (h);
	  if (print)
	    (void) printf ("tail %u; ", prev_hindex);
	  h = find_slot (prev_hindex);
	  if (print)
	    (void) printf ("new slot %u; ", h);
	  hash_set_next (prev_hindex, h);
	}
      htep = make_hash_table_entry (outbuf, len);
      hash_store (h, htep);
      store_flags (h, flags);
      if (print)
	(void) printf ("\n");
    }
  /* NOTE(review): this newline goes to stdout although the progress
     line it appears to terminate was written to stderr -- confirm the
     intended stream before changing it. */
  (void) printf ("\n");
  free ((char *) hused);
}
/* Fatal error handler for a word list that exceeds the table limit:
   prints the maximum word count (HASH_SPECIAL - 1) on stderr and
   terminates the program with exit status 1.  Does not return. */
void
toomanywords ()
{
  fprintf (stderr, "too many words, max = %u\n", HASH_SPECIAL - 1);
  exit (1);
}
#if 0
get_bitvec (bitvec, n)
unsigned char *bitvec;
unsigned short n;
{
static unsigned char bittbl[] =
{1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80};
return (bitvec[