home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Usenet 1994 October
/
usenetsourcesnewsgroupsinfomagicoctober1994disk2.iso
/
unix
/
volume25
/
classify
/
classify.c
< prev
next >
Wrap
C/C++ Source or Header
|
1992-02-28
|
17KB
|
564 lines
/*
* `classify': Sort files into groups by content
* Copyright (C) 1991 Mark-Jason Dominus. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 1, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <stdio.h>
#include <ctype.h>
#include <string.h>
/* Return codes from `compare()' and macros for handling them. */
#define SAME 1
#define DIFFERENT 0
#define BADFILE1 4
#define BADFILE2 8
#define BADFILEBOTH (BADFILE1 | BADFILE2)
#define ERROR(RC) ((RC != SAME) && (RC != DIFFERENT))
/* Allocate a new object and return a pointer to it. */
#define NEW(type) ((type *) malloc(sizeof(type)))
/* Are strings a and b equal? ('equal', not 'eq') */
#define STREQ(a,b) (strcmp((a),(b)) == 0)
/* Flags set by command-line options. */
int foldflag = 0, blankflag = 0, whiteflag = 0;
/* Format option and codes. */
char formopt = 's';
/* Explanation of data structure used in this program:
Each 'masternode'is a linked list of filenames. The filenames in each
masternode are the names of files that are identical, modulo some
whitespace and upper/lower-case distinctions.
The masternodes are linked together in a linked list called `list'.
Example:
list
|
V data next next
masternode------>filenode------->filenode------>NULL
| | |
| next | data | data
| V V
| filename1 filename2
|
V data next
masternode------>filenode------>NULL
| |
| next | data
| V
V filename3
NULL
This would represent three files: filename1, filename2, and filename3,
of which filename1 and filename2 had the same contents, and filename3
was different from both.
Note: if j is a pointer to a masternode, then j->data->data is the
first filename in j's masternode.
*/
typedef struct s_fnode {
char *data;
struct s_fnode *next;
} filenode;
typedef struct s_mnode {
filenode *data;
struct s_mnode *next;
} masternode;
main(argc,argv)
int argc;
char *argv[];
{
/* Look at these absurd declarations! */
int i, compare(), match, numfileargs, parseargs();
void usage(), mappend(), fappend();
masternode *list, *j, *mnew, *newmasternode();
filenode *fnew, *k, *newfilenode();
FILE *checkit;
/* Didn't anyone ever tell you it wasn't polite to point? */
/* Parse the arguments and obliterate switch options like `-f'. */
numfileargs = parseargs(argc,argv);
/* Anything that survives obliteration is assumed to be a filename. */
/* No, no--what is the good of comparing only one file? */
if (numfileargs < 2) usage(argv[0]);
/* Find the name of the first file on the command line. */
for (i=1; argv[i] == NULL; i++) ;
/* This program has two essentially separate functions.
* One is to take a list of files and group identical ones.
* The other is to see which of files 2...n are identical to
* file 1.
*
* If you specify -m or -M, you get the second
* functionality. Otherwise, you get the first.
*
* What follows right here is the second functionality.
*/
if (formopt == 'm' || formopt == 'M') {
/* The first file the user named is the one to check the others against.
*/
char *master = argv[i];
/* If the user said '-M', echo the name of the master
* file; if not, suppress it. */
if (formopt == 'M')
printf("%s\n",master);
for (i += 1; i<argc; i++) {
if (argv[i] == NULL) continue;
/* If for some reason the name of the master file appears more than
once on the command line, suppress it. (This happens all the time
in shell scripts.)
*/
if (STREQ(argv[i], master)) continue;
match = compare(master, argv[i]);
if (match == SAME)
printf("%s\n", argv[i]);
/* If `match' was an error return, `compare()' printed an error */
/* message for us and we need do nothing special. */
}
exit(0); /* This part of the program can't fail. */
}
/* If we're here, then the user didn't select -m or -M, and we
do the normal thing, which is to look at all the files and sort them
into groups.
*/
/* This next bit of code catches a peculiar bug: If it were not here, and
if we couldn't open the first file named on the command line, we would
put it into the linked list anyway (list->data->data = argv[i]) and
subsequent files would get checked against it, yielding many error
messages, much wasted time, and erroneous output--there would be a
`Class 1' with the bad file alone in it.
Putting in this check allows us to make much simpler
list-initialization code. I hate writing special-case code for
starting off linked lists!
*/
while (((checkit = fopen(argv[i],"r")) == NULL) &&
i < argc)
fprintf(stderr, "Couldn't open file %s.\n", argv[i++]);
fclose(checkit);
if (i == argc) exit(0); /* Couldn't read *any* of the input files. */
/* Initialize linked lists */
list = newmasternode();
list->data->data = argv[i];
/* Wasn't that simple? Told you so. */
for (i += 1; i < argc; i++) { /* Loop through filenames... */
if (argv[i] == NULL) continue; /* ... skipping nulls ... */
match = DIFFERENT;
j=list;
do {
/* ... matching the current file with the file at the head of each */
/* class-list ... */
match = compare(argv[i], j->data->data);
if (match == DIFFERENT) j = j->next;
/* ... until we run out of class lists or find a match or an error. */
} while (j && (match == DIFFERENT));
/* Now, if there was an error, then... */
if (ERROR(match)) {
/* ... I hope it was in the current file--that's no problem; we just
obliterate it from the list of files to check, and move on, but...
*/
if ((match & BADFILE1) == BADFILE1) {
argv[i] = NULL;
continue;
}
/* ... if the problem was with the file in the class list, I am very
upset, because it _was_ okay when I put it into the list.
(I have violated Steinbach's Guideline for Systems Programming:
``Never test for an error condition you don't know how to
handle.'' But actually I could handle this; we could delete the
bogus file from the class-list in which it appears. This is a lot
of work and it will happen only very rarely and in bizarre
circumstances, so I choose not to bother. So sue me.
*/
else if ((match & BADFILE2) == BADFILE2) {
fprintf(stderr,"WARNING:\tSomething went wrong with file %s\n",
j->data->data);
fprintf(stderr,"since the last time I looked at it.\n");
/* Yes, Virginia, this is correct behavior. */
}
}
/* Okay, there was no error, but the current file was *not* like
any of the ones we've seen so far. Make a new classification and
put the current filename into it.
*/
else if (match == DIFFERENT) {
mnew = newmasternode();
mnew->data->data = argv[i];
mappend(list,mnew);
}
/* Ah, we found a match--the current file is identical to the ones in */
/* the classification j->data. */
else {
#ifdef DEBUG
fprintf(stderr, "%s matched %s.\n", argv[i], j->data->data);
#endif
fnew = newfilenode();
fnew->data = argv[i];
fappend(j->data, fnew);
}
} /* for (i += 1; ... ) */
/* We are out of the main loop and all the files have been handled,
one way or another. Now it is time to spit out the output.
*/
/* `formopt' is '1' if the user selected the `-1' option. It means
* that the proram should not do the default thing, which is to make a
* nice long report of who matched whom, but rather should just dump out
* a list of files each of which represents exactly one of the classes. */
if (formopt == '1') {
for (j=list; j; j=j->next)
printf("%s\n", j->data->data);
}
/* `formopt' is 's' if the user selected the '-s' option. That
* means that the program should make a short, awkable kind of
* output, with one line per class, filenames separated by a single
* space. Note that we do not number the lines. (I almost had it
* number the lines.) The idea is that if the user wanted the lines
* numbered, they would pipe the output through 'cat -n'. */
else if (formopt == 's') {
for (j=list; j; j=j->next) {
for (k = j->data; k; k=k->next)
printf("%s ", k->data);
printf("\n");
}
}
/* Here we make the nice long report. The temptation to add many
bells and whistles and have the program accept a format-specification
string and so on is very tempting, but I will not give in to foolish
creeping featurism. At least, not any more than I already have.
Actually, a short-form option, the puts the output in the form
1 foo.c bar.c baz.c la.c
2 la de da oakum yokum
3 cruft FOCUS
4 adventure
might be very useful, because as it is you can't really feed this
program's output to AWK in a reasonable way.
*/
/* Note added in proof: I gave in to creeping featurism. See the
* '-s' option. Sigh. At least I did not make it number the lines. */
else {
for (j=list, i=1; j; j=j->next, i++) {
printf("\nClass %d:\n",i);
for (k = j->data; k; k=k->next) {
printf("\t%s\n",k->data);
}
}
}
exit(0); /* Au 'voir! */
}
/* This next `compare' routine is what I used to do, but there are good
reasons for not using either diff(1) or cmp(1):
1. Do not use diff(1) because it is too intelligent (intelligent ->
slow.) Diff tells you where the files differ and that is not what we
want--we just want to know if they are different or not.
2. Do not use cmp(1) because we want to use this program for comparing
things like /etc/rc.local and /etc/motd which are very likely to differ
only in a few whitespaces, and we want this program to report that such
files are identical, even though cmp says they're not.
Maybe UNIX needs a nice, simple, flexible file-compare utility? Naah,
you can always string awk and sed and things onto the front of cmp. But
that's too slow for us here.
*/
/* Do not do this:
int
compare(path1,path2)
char *path1, *path2;
{
char compare[1024];
sprintf(compare,"cmp -s %s %s",path1,path2);
sprintf(compare,"diff -w %s %s > /dev/null 2>&1",path1,path2);
return((system(compare) >> 8 == 0) ? SAME : DIFFERENT );
}
*/
/* So this is what we do instead. */
int
compare(path1, path2)
char *path1, *path2;
{
FILE *file1, *file2;
int c1,c2;
if ((file1 = fopen(path1,"r")) == NULL) {
fprintf(stderr, "Couldn't open file %s.\n", path1);
return(BADFILE1);
}
if ((file2 = fopen(path2,"r")) == NULL) {
fprintf(stderr, "Couldn't open file %s.\n", path2);
return(BADFILE2); /* For symmetry, even though this program will become
quite irate if `compare' ever returns this code.
*/
}
do {
do {
c1 = getc(file1);
/* You may need to make a Karnaugh map to understand this termination
condition, but it essentially means to ignore the right white spaces
if the right option flags are set, and I have tested it for you,
so you may assume it is doing the thing that the man page says it
does.
*/
} while (! ((!blankflag && !whiteflag) ||
((c1 != ' ' && c1 != '\t') && (c1 != '\n' || !whiteflag)))
) ;
do {
c2 = getc(file2);
} while (! ((!blankflag && !whiteflag) || /* Ditto */
((c2 != ' ' && c2 != '\t') && (c2 != '\n' || !whiteflag)))
) ;
/* Fold case if requested with `-f' flag. */
if (foldflag) {
c1 = (isupper(c1) ? tolower(c1) : c1);
c2 = (isupper(c2) ? tolower(c2) : c2);
}
if (c1 != c2) {
fclose(file1);
fclose(file2);
return DIFFERENT;
}
} while (c1 != EOF && c2 != EOF);
fclose(file1);
fclose(file2);
/* If we're here, then both files were identical and we tapped out at */
/* least one of them. If we tapped out both, they really are identical. */
/* If, on the other hand, only one is finished, then it is a strict */
/* prefix of the other and so the two files are *not* the same. */
if (c1 == EOF && c2 == EOF)
return SAME;
else
return DIFFERENT;
}
/* Nyahh nyah! User is a big stupid-head! */
void
usage(progname)
char *progname;
{
char *tail;
tail = strrchr(progname,'/');
if (tail) progname = tail+1;
fprintf(stderr,"Usage:\t %s [-1 | -s | -l | -m | -M] [-f] [-b | -w]\n",progname);
fprintf(stderr,"\tfile1 file2 [...]\n");
fprintf(stderr,"\n\nTry %s -h\t for help.\n", progname);
exit(-1);
}
/* I put this here 'cause I didn't want to write a man page. Duuhhhhh. */
void
help()
{
fprintf(stderr,"Classify: Examine and group identical files.\n\n");
fprintf(stderr,"Flags:\n\t-f\tFold case in file comparisions.\n");
fprintf(stderr,"\t-b\tIgnore blanks and TABs in file comparisions.\n");
fprintf(stderr,"\t-w\tIgnore all whitespace in file comparisions.\n");
fprintf(stderr,"\t-1\tPrint the name of only one file from each class.\n");
fprintf(stderr,"\t-l\tPrint long-format output (default).\n");
fprintf(stderr,"\t-s\tPrint short-format output.\n");
fprintf(stderr,"\t-M\tPrint only names of files that match first file named.\n");
fprintf(stderr,"\t-m\tLike -M, but suppress first filename.\n");
return;
}
/* Parse the args and set the flags.
We want the argument list to be free-form so you can mix filenames and
options. That is because I am a masochist. So to save trouble, we just
obliterate the flag arguments by setting them to NULL, and then we have
the main routine ignore NULL arguments if it sees any. Programmers who
say `but then you can't tell when you've reached the end of the arg list
because it is supposed to be a NULL-terminated array!' get a boot to the
head.
Returns the number of non-flag arguments.
*/
int
parseargs(argc,argv)
int argc;
char *argv[];
{
int i, j, numnonflags = argc-1;
void usage(), help();
for (i=1; i<argc; i++) {
if (argv[i][0] != '-') continue;
numnonflags -= 1;
if (argv[i][1] == '\0') { /* If flag is "-", stop parsing args */
/* Probably `-' should mean to read the stdin. I will put in
that feature three days after next tishabov.
(Translation for gentiles: I will put the feature in on the
fourth Thursday of next week. )
*/
argv[i] = NULL;
return numnonflags;
}
for (j=1; argv[i][j]; j++) {
switch (argv[i][j]) {
case '-': /* If flag is "--", stop parsing args */
if (j==1) {
argv[i] = NULL;
return;
} /* Else we got a flag like -f-w, so ignore the second "-" sign. */
break;
case 'f':
foldflag = 1;
break;
case 'b':
blankflag = 1;
break;
case 'w':
whiteflag = 1;
break;
case 'l':
formopt = 'l';
break;
case 's':
formopt = 's';
break;
case '1':
formopt = '1';
break;
case 'h':
help(); /* ``Why does this function return?''
`` `Cause you're an idiot.''
``Oh yeah. I forgot.''
*/
exit(0);
break;
case 'm':
formopt = 'm';
break;
case 'M':
formopt = 'M';
break;
default:
fprintf(stderr, "Unknown option: -%c.\n", argv[i][j]);
usage(argv[0]);
}
}
if (argv[i][0] == '-') argv[i] = NULL; /* Obliterate flag arguments. */
}
return numnonflags;
}
/* Manufacture a new masternode whose car is a new filenode. Return a */
/* pointer to the new masternode. */
masternode *
newmasternode()
{
masternode *foo;
filenode *newfilenode();
foo = NEW(masternode);
foo->next = NULL;
foo->data = newfilenode();
return(foo);
}
/* Manufacture a new filenode whose car is the null string. Return a */
/* pointer to the new filenode. */
filenode *
newfilenode()
{
filenode *foo;
foo = NEW(filenode);
foo->next = NULL;
foo->data = NULL;
return(foo);
}
/* head and tail are pointers to masternodes. (i.e., they are linked lists */
/* of masternodes.) Append tail to the end of head. (LISP pepole would */
/* call this operation `nconc'. I can't say the word `nconc' without */
/* bursting out laughing, so I called it `mappend' instead.) */
void
mappend(head,tail)
masternode *head, *tail;
{
masternode *i;
/* Find the end of the linked list `head' */
for (i=head; i->next; i = i->next) ;
/* Concatenate. */
i->next = tail;
return;
}
/* This is the same as mappend, except it works on filenode-lists instead */
/* of masternode-lists. Big deal. */
void
fappend(head,tail)
filenode *head, *tail;
{
filenode *i;
for (i=head; i->next; i = i->next) ;
/* nconc! nconc! nconc! hahahaha! */
i->next = tail;
return;
}