home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
CP/M
/
CPM_CDROM.iso
/
cpm
/
cpm68k
/
arc68k.arc
/
ARCSQ.C
< prev
next >
Wrap
Text File
|
1987-11-27
|
18KB
|
585 lines
/*
* arcsq.c 1.1
*
* Author: Thom Henderson
* Original System V port: Mike Stump
* Enhancements, Bug fixes, and cleanup: Chris Seaman
* Date: Fri Mar 20 09:57:02 1987
* Last Mod. 3/21/87
*
*/
/*
* ARC - Archive utility - ARCSQ
*
* Version 3.10, created on 01/30/86 at 20:10:46
*
* (C) COPYRIGHT 1985 by System Enhancement Associates; ALL RIGHTS RESERVED
*
* Description:
* This file contains the routines used to squeeze a file
* when placing it in an archive.
*
* Programming notes:
* Most of the routines used for the Huffman squeezing algorithm
* were lifted from the SQ program by Dick Greenlaw, as adapted
* to CI-C86 by Robert J. Beilstein.
*/
#include "arc.h"
/* stuff for Huffman squeezing */
#define TRUE 1
#define FALSE 0
#define ERROR (-1)
#define SPEOF 256 /* special endfile token */
#define NOCHILD (-1) /* marks end of path through tree */
#define NUMVALS 257 /* 256 data values plus SPEOF*/
#define NUMNODES (NUMVALS+NUMVALS-1) /* number of nodes */
#define MAXCOUNT (unsigned INT) 65535 /* biggest unsigned integer */
/*
* The following array of structures are the nodes of the
* binary trees. The first NUMVALS nodes become the leaves of the
* final tree and represent the values of the data bytes being
* encoded and the special endfile, SPEOF.
* The remaining nodes become the internal nodes of the final tree.
*/
struct nd { /* shared by unsqueezer */
unsigned INT weight; /* number of appearances */
INT tdepth; /* length on longest path in tree */
INT lchild, rchild; /* indices to next level */
} node[NUMNODES]; /* use large buffer */
static INT dctreehd; /* index to head of final tree */
/*
* This is the encoding table:
* The bit strings have first bit in low bit.
* Note that counts were scaled so code fits unsigned integer.
*/
static INT codelen[NUMVALS]; /* number of bits in code */
static unsigned INT code[NUMVALS]; /* code itself, right adjusted */
static unsigned INT tcode; /* temporary code value */
static long valcount[NUMVALS]; /* actual count of times seen */
/* Variables used by encoding process */
static INT curin; /* value currently being encoded */
static INT cbitsrem; /* # of code string bits left */
static unsigned INT ccode; /* current code right justified */
INT init_sq() /* prepare for scanning pass */
{
INT i; /* node index */
/*
* Initialize all nodes to single element binary trees
* with zero weight and depth.
*/
for (i=0; i<NUMNODES; ++i)
{
node[i].weight = 0;
node[i].tdepth = 0;
node[i].lchild = NOCHILD;
node[i].rchild = NOCHILD;
}
for (i=0; i<NUMVALS; i++)
valcount[i] = 0;
}
INT scan_sq(c) /* add a byte to the tables */
INT c; /* byte to add */
{
unsigned INT *wp; /* speeds up weight counting */
/* Build frequency info in tree */
if (c == EOF) /* it's traditional */
c = SPEOF; /* dumb, but traditional */
if (*(wp = &node[c].weight) != MAXCOUNT)
++(*wp); /* bump weight counter */
valcount[c]++; /* bump byte counter */
}
long pred_sq() /* predict size of squeezed file */
{
INT i;
INT btlist[NUMVALS]; /* list of intermediate b-trees */
INT listlen; /* length of btlist */
unsigned INT ceiling; /* limit for scaling */
long size; /* predicted size */
INT numnodes; /* # of nodes in simplified tree */
INT scale();
INT heap();
INT bld_tree();
INT buildenc();
INT init_enc();
size = 0;
scan_sq(EOF); /* signal end of input */
ceiling = MAXCOUNT;
/* Keep trying to scale and encode */
do
{
scale(ceiling);
ceiling /= 2; /* in case we rescale */
/*
* Build list of single node binary trees having
* leaves for the input values with non-zero counts
*/
for (i=listlen=0; i<NUMVALS; ++i)
{
if (node[i].weight != 0)
{
node[i].tdepth = 0;
btlist[listlen++] = i;
}
}
/*
* Arrange list of trees into a heap with the entry
* indexing the node with the least weight at the top.
*/
heap(btlist,listlen);
/* Convert the list of trees to a single decoding tree */
bld_tree(btlist,listlen);
/* Initialize the encoding table */
init_enc();
/*
* Try to build encoding table.
* Fail if any code is > 16 bits long.
*/
}
while (buildenc(0,dctreehd) == ERROR);
/* Initialize encoding variables */
cbitsrem = 0; /* force initial read */
curin = 0; /* anything but endfile */
for (i=0; i<NUMVALS; i++) /* add bits for each code */
size += valcount[i] * codelen[i];
size = (size+7)/8; /* reduce to number of bytes */
numnodes = dctreehd<NUMVALS ? 0 : dctreehd-(NUMVALS-1);
size += sizeof(INT) + 2*numnodes*sizeof(INT);
return(size);
}
/*
* The count of number of occurrances of each input value
* have already been prevented from exceeding MAXCOUNT.
* Now we must scale them so that their sum doesn't exceed
* ceiling and yet no non-zero count can become zero.
* This scaling prevents errors in the weights of the
* interior nodes of the Huffman tree and also ensures that
* the codes will fit in an unsigned integer. Rescaling is
* used if necessary to limit the code length.
*/
static INT scale(ceil)
unsigned INT ceil; /* upper limit on total weight */
{
register INT i;
INT ovflw, divisor;
unsigned INT w, sum;
unsigned INT increased; /* flag */
do
{
for (i=sum=ovflw=0; i<NUMVALS; ++i)
{
if (node[i].weight > (ceil-sum))
++ovflw;
sum += node[i].weight;
}
divisor = ovflw + 1;
/* Ensure no non-zero values are lost */
increased = FALSE;
for (i=0; i<NUMVALS; ++i)
{
w = node[i].weight;
if (w<divisor && w!=0)
{
/* Don't fail to provide a code if it's used at all */
node[i].weight = divisor;
increased = TRUE;
}
}
}
while (increased);
/* Scaling factor choosen, now scale */
if (divisor>1)
for (i=0; i<NUMVALS; ++i)
node[i].weight /= divisor;
}
/*
* heap() and adjust() maintain a list of binary trees as a
* heap with the top indexing the binary tree on the list
* which has the least weight or, in case of equal weights,
* least depth in its longest path. The depth part is not
* strictly necessary, but tends to avoid long codes which
* might provoke rescaling.
*/
static INT heap(list,length)
INT list[], length;
{
register INT i;
INT adjust();
for (i=(length-2)/2; i>=0; --i)
adjust(list,i,length-1);
}
/* Make a heap from a heap with a new top */
static INT adjust(list,top,bottom)
INT list[], top, bottom;
{
register INT k, temp;
INT cmptrees();
k = 2 * top + 1; /* left child of top */
temp = list[top]; /* remember root node of top tree */
if (k<=bottom)
{
if (k<bottom && cmptrees(list[k],list[k+1]))
++k;
/* k indexes "smaller" child (in heap of trees) of top */
/* now make top index "smaller" of old top and smallest child */
if (cmptrees(temp,list[k]))
{
list[top] = list[k];
list[k] = temp;
/* Make the changed list a heap */
adjust(list,k,bottom); /* recursive */
}
}
}
/*
* Compare two trees, if a > b return true, else return false.
* Note comparison rules in previous comments.
*/
static INT cmptrees(a,b)
INT a, b; /* root nodes of trees */
{
if (node[a].weight > node[b].weight)
return(TRUE);
if (node[a].weight == node[b].weight)
if (node[a].tdepth > node[b].tdepth)
return(TRUE);
return(FALSE);
}
/*
* HUFFMAN ALGORITHM: develops the single element trees
* into a single binary tree by forming subtrees rooted in
* interior nodes having weights equal to the sum of weights of all
* their descendents and having depth counts indicating the
* depth of their longest paths.
*
* When all trees have been formed into a single tree satisfying
* the heap property (on weight, with depth as a tie breaker)
* then the binary code assigned to a leaf (value to be encoded)
* is then the series of left (0) and right (1)
* paths leading from the root to the leaf.
* Note that trees are removed from the heaped list by
* moving the last element over the top element and
* reheaping the shorter list.
*/
static INT bld_tree(list,len)
INT list[];
INT len;
{
register INT freenode; /* next free node in tree */
register struct nd *frnp; /* free node pointer */
INT lch, rch; /* temps for left, right children */
INT maxchar();
/*
* Initialize index to next available (non-leaf) node.
* Lower numbered nodes correspond to leaves (data values).
*/
freenode = NUMVALS;
while (len>1)
{
/*
* Take from list two btrees with least weight
* and build an interior node pointing to them.
* This forms a new tree.
*/
lch = list[0]; /* This one will be left child */
/* delete top (least) tree from the list of trees */
list[0] = list[--len];
adjust(list,0,len-1);
/* Take new top (least) tree. Reuse list slot later */
rch = list[0]; /* This one will be right child */
/*
* Form new tree from the two least trees using
* a free node as root. Put the new tree in the list.
*/
frnp = &node[freenode]; /* address of next free node */
list[0] = freenode++; /* put at top for now */
frnp->lchild = lch;
frnp->rchild = rch;
frnp->weight = node[lch].weight + node[rch].weight;
frnp->tdepth = 1 + maxchar(node[lch].tdepth, node[rch].tdepth);
/* reheap list to get least tree at top */
adjust(list,0,len-1);
}
dctreehd = list[0]; /* head of final tree */
}
static INT maxchar(a,b)
{
return(a>b ? a : b);
}
static INT init_enc()
{
register INT i;
for (i=0; i<NUMVALS; ++i) /* Initialize encoding table */
codelen[i] = 0;
}
/*
* Recursive routine to walk the indicated subtree and level
* and maintain the current path code in bstree. When a leaf
* is found the entire code string and length are put into
* the encoding table entry for the leaf's data value .
*
* Returns ERROR if codes are too long.
*/
static INT buildenc(level,root)
INT level; /* level being examined */
INT root; /* root of subtree/data value if leaf*/
{
register INT l, r;
l = node[root].lchild;
r = node[root].rchild;
if (l==NOCHILD && r==NOCHILD)
{
/*
* Leaf. Previous path determines bit string
* code of length level (bits 0 to level - 1).
* Ensures unused code bits are zero.
*/
codelen[root] = level;
code[root] = tcode & (((unsigned INT)~0) >> (16-level));
return((level>16) ? ERROR : NULL);
}
else
{
if (l!=NOCHILD)
{
/* Clear path bit and continue deeper */
tcode &= ~(1 << level);
if (buildenc(level+1,l)==ERROR)
return(ERROR); /* pass back bad statuses */
}
if (r!=NOCHILD)
{
/* Set path bit and continue deeper */
tcode |= 1 << level;
if (buildenc(level+1,r)==ERROR)
return(ERROR); /* pass back bad statuses */
}
}
return(NULL); /* it worked if we reach here */
}
static INT put_int(n,f) /* output an integer */
INT n; /* integer to output */
FILE *f; /* file to put it to */
{
putc_pak(n&0xff,f); /* first the low byte */
putc_pak(n>>8,f); /* then the high byte */
}
/* Write out the header of the compressed file */
static long wrt_head(ob)
FILE *ob;
{
register INT l,r;
INT i, k;
INT numnodes; /* # of nodes in simplified tree */
/*
* Write out a simplified decoding tree. Only the interior
* nodes are written. When a child is a leaf index
* (representing a data value) it is recoded as
* -(index + 1) to distinguish it from interior indexes
* which are recoded as positive indexes in the new tree.
*
* Note that this tree will be empty for an empty file.
*/
numnodes = dctreehd<NUMVALS ? 0 : dctreehd-(NUMVALS-1);
put_int(numnodes,ob);
for (k=0, i=dctreehd; k<numnodes; ++k, --i)
{
l = node[i].lchild;
r = node[i].rchild;
l = l<NUMVALS ? -(l+1) : dctreehd-l;
r = r<NUMVALS ? -(r+1) : dctreehd-r;
put_int(l,ob);
put_int(r,ob);
}
return(sizeof(INT) + numnodes*2*sizeof(INT));
}
/*
* Get an encoded byte or EOF. Reads from specified stream AS NEEDED.
*
* There are two unsynchronized bit-byte relationships here.
* The input stream bytes are converted to bit strings of
* various lengths via the static variables named c...
* These bit strings are concatenated without padding to
* become the stream of encoded result bytes, which this
* function returns one at a time. The EOF (end of file) is
* converted to SPEOF for convenience and encoded like any
* other input value. True EOF is returned after that.
*/
static INT gethuff(ib) /* Returns bytes except for EOF */
FILE *ib;
{
INT rbyte; /* Result byte value */
INT need; /* numbers of bits */
rbyte = 0;
need = 8; /* build one byte per call */
/*
* Loop to build a byte of encoded data.
* Initialization forces read the first time.
*/
while (1)
{
if (cbitsrem>=need) /* if current code is big enough */
{
if (need==0)
return(rbyte);
rbyte |= ccode << (8-need);/* take what we need */
ccode >>= need; /* and leave the rest */
cbitsrem -= need;
return(rbyte & 0xff);
}
/* We need more than current code */
if (cbitsrem>0)
{
rbyte |= ccode << (8-need);/* take what there is */
need -= cbitsrem;
}
/* No more bits in current code string */
if (curin==SPEOF)
{
/*
* The end of file token has been encoded. If
* result byte has data return it and do EOF next time.
*/
cbitsrem = 0;
return((need==8) ? EOF : rbyte + 0);
}
/* Get an input byte */
if ((curin=getc_ncr(ib)) == EOF)
curin = SPEOF; /* convenient for encoding */
ccode = code[curin]; /* get the new byte's code */
cbitsrem = codelen[curin];
}
}
/*
* This routine is used to perform the actual squeeze operation. It can
* only be called after the file has been scanned. It returns the true
* length of the squeezed entry.
*/
long file_sq(f,t) /* squeeze a file into an archive */
FILE *f; /* file to squeeze */
FILE *t; /* archive to receive file */
{
INT c; /* one byte of squeezed data */
long size; /* size after squeezing */
size = wrt_head(t); /* write out the decode tree */
while ((c=gethuff(f))!=EOF)
{
putc_pak(c,t);
size++;
}
return(size); /* report true size */
}