CP/M

home *** CD-ROM | disk | FTP | other *** search

/ CP/M / CPM_CDROM.iso / cpm / cpm68k / arc68k.arc / ARCSQ.C < prev next >

Wrap

Text File | 1987-11-27 | 18KB | 585 lines

/* * arcsq.c 1.1 * * Author: Thom Henderson * Original System V port: Mike Stump * Enhancements, Bug fixes, and cleanup: Chris Seaman * Date: Fri Mar 20 09:57:02 1987 * Last Mod. 3/21/87 * */ /* * ARC - Archive utility - ARCSQ * * Version 3.10, created on 01/30/86 at 20:10:46 * * (C) COPYRIGHT 1985 by System Enhancement Associates; ALL RIGHTS RESERVED * * Description: * This file contains the routines used to squeeze a file * when placing it in an archive. * * Programming notes: * Most of the routines used for the Huffman squeezing algorithm * were lifted from the SQ program by Dick Greenlaw, as adapted * to CI-C86 by Robert J. Beilstein. */ #include "arc.h" /* stuff for Huffman squeezing */ #define TRUE 1 #define FALSE 0 #define ERROR (-1) #define SPEOF 256 /* special endfile token */ #define NOCHILD (-1) /* marks end of path through tree */ #define NUMVALS 257 /* 256 data values plus SPEOF*/ #define NUMNODES (NUMVALS+NUMVALS-1) /* number of nodes */ #define MAXCOUNT (unsigned INT) 65535 /* biggest unsigned integer */ /* * The following array of structures are the nodes of the * binary trees. The first NUMVALS nodes become the leaves of the * final tree and represent the values of the data bytes being * encoded and the special endfile, SPEOF. * The remaining nodes become the internal nodes of the final tree. */ struct nd { /* shared by unsqueezer */ unsigned INT weight; /* number of appearances */ INT tdepth; /* length on longest path in tree */ INT lchild, rchild; /* indices to next level */ } node[NUMNODES]; /* use large buffer */ static INT dctreehd; /* index to head of final tree */ /* * This is the encoding table: * The bit strings have first bit in low bit. * Note that counts were scaled so code fits unsigned integer. */ static INT codelen[NUMVALS]; /* number of bits in code */ static unsigned INT code[NUMVALS]; /* code itself, right adjusted */ static unsigned INT tcode; /* temporary code value */ static long valcount[NUMVALS]; /* actual count of times seen */ /* Variables used by encoding process */ static INT curin; /* value currently being encoded */ static INT cbitsrem; /* # of code string bits left */ static unsigned INT ccode; /* current code right justified */ INT init_sq() /* prepare for scanning pass */ { INT i; /* node index */ /* * Initialize all nodes to single element binary trees * with zero weight and depth. */ for (i=0; i<NUMNODES; ++i) { node[i].weight = 0; node[i].tdepth = 0; node[i].lchild = NOCHILD; node[i].rchild = NOCHILD; } for (i=0; i<NUMVALS; i++) valcount[i] = 0; } INT scan_sq(c) /* add a byte to the tables */ INT c; /* byte to add */ { unsigned INT *wp; /* speeds up weight counting */ /* Build frequency info in tree */ if (c == EOF) /* it's traditional */ c = SPEOF; /* dumb, but traditional */ if (*(wp = &node[c].weight) != MAXCOUNT) ++(*wp); /* bump weight counter */ valcount[c]++; /* bump byte counter */ } long pred_sq() /* predict size of squeezed file */ { INT i; INT btlist[NUMVALS]; /* list of intermediate b-trees */ INT listlen; /* length of btlist */ unsigned INT ceiling; /* limit for scaling */ long size; /* predicted size */ INT numnodes; /* # of nodes in simplified tree */ INT scale(); INT heap(); INT bld_tree(); INT buildenc(); INT init_enc(); size = 0; scan_sq(EOF); /* signal end of input */ ceiling = MAXCOUNT; /* Keep trying to scale and encode */ do { scale(ceiling); ceiling /= 2; /* in case we rescale */ /* * Build list of single node binary trees having * leaves for the input values with non-zero counts */ for (i=listlen=0; i<NUMVALS; ++i) { if (node[i].weight != 0) { node[i].tdepth = 0; btlist[listlen++] = i; } } /* * Arrange list of trees into a heap with the entry * indexing the node with the least weight at the top. */ heap(btlist,listlen); /* Convert the list of trees to a single decoding tree */ bld_tree(btlist,listlen); /* Initialize the encoding table */ init_enc(); /* * Try to build encoding table. * Fail if any code is > 16 bits long. */ } while (buildenc(0,dctreehd) == ERROR); /* Initialize encoding variables */ cbitsrem = 0; /* force initial read */ curin = 0; /* anything but endfile */ for (i=0; i<NUMVALS; i++) /* add bits for each code */ size += valcount[i] * codelen[i]; size = (size+7)/8; /* reduce to number of bytes */ numnodes = dctreehd<NUMVALS ? 0 : dctreehd-(NUMVALS-1); size += sizeof(INT) + 2*numnodes*sizeof(INT); return(size); } /* * The count of number of occurrances of each input value * have already been prevented from exceeding MAXCOUNT. * Now we must scale them so that their sum doesn't exceed * ceiling and yet no non-zero count can become zero. * This scaling prevents errors in the weights of the * interior nodes of the Huffman tree and also ensures that * the codes will fit in an unsigned integer. Rescaling is * used if necessary to limit the code length. */ static INT scale(ceil) unsigned INT ceil; /* upper limit on total weight */ { register INT i; INT ovflw, divisor; unsigned INT w, sum; unsigned INT increased; /* flag */ do { for (i=sum=ovflw=0; i<NUMVALS; ++i) { if (node[i].weight > (ceil-sum)) ++ovflw; sum += node[i].weight; } divisor = ovflw + 1; /* Ensure no non-zero values are lost */ increased = FALSE; for (i=0; i<NUMVALS; ++i) { w = node[i].weight; if (w<divisor && w!=0) { /* Don't fail to provide a code if it's used at all */ node[i].weight = divisor; increased = TRUE; } } } while (increased); /* Scaling factor choosen, now scale */ if (divisor>1) for (i=0; i<NUMVALS; ++i) node[i].weight /= divisor; } /* * heap() and adjust() maintain a list of binary trees as a * heap with the top indexing the binary tree on the list * which has the least weight or, in case of equal weights, * least depth in its longest path. The depth part is not * strictly necessary, but tends to avoid long codes which * might provoke rescaling. */ static INT heap(list,length) INT list[], length; { register INT i; INT adjust(); for (i=(length-2)/2; i>=0; --i) adjust(list,i,length-1); } /* Make a heap from a heap with a new top */ static INT adjust(list,top,bottom) INT list[], top, bottom; { register INT k, temp; INT cmptrees(); k = 2 * top + 1; /* left child of top */ temp = list[top]; /* remember root node of top tree */ if (k<=bottom) { if (k<bottom && cmptrees(list[k],list[k+1])) ++k; /* k indexes "smaller" child (in heap of trees) of top */ /* now make top index "smaller" of old top and smallest child */ if (cmptrees(temp,list[k])) { list[top] = list[k]; list[k] = temp; /* Make the changed list a heap */ adjust(list,k,bottom); /* recursive */ } } } /* * Compare two trees, if a > b return true, else return false. * Note comparison rules in previous comments. */ static INT cmptrees(a,b) INT a, b; /* root nodes of trees */ { if (node[a].weight > node[b].weight) return(TRUE); if (node[a].weight == node[b].weight) if (node[a].tdepth > node[b].tdepth) return(TRUE); return(FALSE); } /* * HUFFMAN ALGORITHM: develops the single element trees * into a single binary tree by forming subtrees rooted in * interior nodes having weights equal to the sum of weights of all * their descendents and having depth counts indicating the * depth of their longest paths. * * When all trees have been formed into a single tree satisfying * the heap property (on weight, with depth as a tie breaker) * then the binary code assigned to a leaf (value to be encoded) * is then the series of left (0) and right (1) * paths leading from the root to the leaf. * Note that trees are removed from the heaped list by * moving the last element over the top element and * reheaping the shorter list. */ static INT bld_tree(list,len) INT list[]; INT len; { register INT freenode; /* next free node in tree */ register struct nd *frnp; /* free node pointer */ INT lch, rch; /* temps for left, right children */ INT maxchar(); /* * Initialize index to next available (non-leaf) node. * Lower numbered nodes correspond to leaves (data values). */ freenode = NUMVALS; while (len>1) { /* * Take from list two btrees with least weight * and build an interior node pointing to them. * This forms a new tree. */ lch = list[0]; /* This one will be left child */ /* delete top (least) tree from the list of trees */ list[0] = list[--len]; adjust(list,0,len-1); /* Take new top (least) tree. Reuse list slot later */ rch = list[0]; /* This one will be right child */ /* * Form new tree from the two least trees using * a free node as root. Put the new tree in the list. */ frnp = &node[freenode]; /* address of next free node */ list[0] = freenode++; /* put at top for now */ frnp->lchild = lch; frnp->rchild = rch; frnp->weight = node[lch].weight + node[rch].weight; frnp->tdepth = 1 + maxchar(node[lch].tdepth, node[rch].tdepth); /* reheap list to get least tree at top */ adjust(list,0,len-1); } dctreehd = list[0]; /* head of final tree */ } static INT maxchar(a,b) { return(a>b ? a : b); } static INT init_enc() { register INT i; for (i=0; i<NUMVALS; ++i) /* Initialize encoding table */ codelen[i] = 0; } /* * Recursive routine to walk the indicated subtree and level * and maintain the current path code in bstree. When a leaf * is found the entire code string and length are put into * the encoding table entry for the leaf's data value . * * Returns ERROR if codes are too long. */ static INT buildenc(level,root) INT level; /* level being examined */ INT root; /* root of subtree/data value if leaf*/ { register INT l, r; l = node[root].lchild; r = node[root].rchild; if (l==NOCHILD && r==NOCHILD) { /* * Leaf. Previous path determines bit string * code of length level (bits 0 to level - 1). * Ensures unused code bits are zero. */ codelen[root] = level; code[root] = tcode & (((unsigned INT)~0) >> (16-level)); return((level>16) ? ERROR : NULL); } else { if (l!=NOCHILD) { /* Clear path bit and continue deeper */ tcode &= ~(1 << level); if (buildenc(level+1,l)==ERROR) return(ERROR); /* pass back bad statuses */ } if (r!=NOCHILD) { /* Set path bit and continue deeper */ tcode |= 1 << level; if (buildenc(level+1,r)==ERROR) return(ERROR); /* pass back bad statuses */ } } return(NULL); /* it worked if we reach here */ } static INT put_int(n,f) /* output an integer */ INT n; /* integer to output */ FILE *f; /* file to put it to */ { putc_pak(n&0xff,f); /* first the low byte */ putc_pak(n>>8,f); /* then the high byte */ } /* Write out the header of the compressed file */ static long wrt_head(ob) FILE *ob; { register INT l,r; INT i, k; INT numnodes; /* # of nodes in simplified tree */ /* * Write out a simplified decoding tree. Only the interior * nodes are written. When a child is a leaf index * (representing a data value) it is recoded as * -(index + 1) to distinguish it from interior indexes * which are recoded as positive indexes in the new tree. * * Note that this tree will be empty for an empty file. */ numnodes = dctreehd<NUMVALS ? 0 : dctreehd-(NUMVALS-1); put_int(numnodes,ob); for (k=0, i=dctreehd; k<numnodes; ++k, --i) { l = node[i].lchild; r = node[i].rchild; l = l<NUMVALS ? -(l+1) : dctreehd-l; r = r<NUMVALS ? -(r+1) : dctreehd-r; put_int(l,ob); put_int(r,ob); } return(sizeof(INT) + numnodes*2*sizeof(INT)); } /* * Get an encoded byte or EOF. Reads from specified stream AS NEEDED. * * There are two unsynchronized bit-byte relationships here. * The input stream bytes are converted to bit strings of * various lengths via the static variables named c... * These bit strings are concatenated without padding to * become the stream of encoded result bytes, which this * function returns one at a time. The EOF (end of file) is * converted to SPEOF for convenience and encoded like any * other input value. True EOF is returned after that. */ static INT gethuff(ib) /* Returns bytes except for EOF */ FILE *ib; { INT rbyte; /* Result byte value */ INT need; /* numbers of bits */ rbyte = 0; need = 8; /* build one byte per call */ /* * Loop to build a byte of encoded data. * Initialization forces read the first time. */ while (1) { if (cbitsrem>=need) /* if current code is big enough */ { if (need==0) return(rbyte); rbyte |= ccode << (8-need);/* take what we need */ ccode >>= need; /* and leave the rest */ cbitsrem -= need; return(rbyte & 0xff); } /* We need more than current code */ if (cbitsrem>0) { rbyte |= ccode << (8-need);/* take what there is */ need -= cbitsrem; } /* No more bits in current code string */ if (curin==SPEOF) { /* * The end of file token has been encoded. If * result byte has data return it and do EOF next time. */ cbitsrem = 0; return((need==8) ? EOF : rbyte + 0); } /* Get an input byte */ if ((curin=getc_ncr(ib)) == EOF) curin = SPEOF; /* convenient for encoding */ ccode = code[curin]; /* get the new byte's code */ cbitsrem = codelen[curin]; } } /* * This routine is used to perform the actual squeeze operation. It can * only be called after the file has been scanned. It returns the true * length of the squeezed entry. */ long file_sq(f,t) /* squeeze a file into an archive */ FILE *f; /* file to squeeze */ FILE *t; /* archive to receive file */ { INT c; /* one byte of squeezed data */ long size; /* size after squeezing */ size = wrt_head(t); /* write out the decode tree */ while ((c=gethuff(f))!=EOF) { putc_pak(c,t); size++; } return(size); /* report true size */ }