home *** CD-ROM | disk | FTP | other *** search
- Subject: v16i052: Remove duplicates from a "bib" database
- Newsgroups: comp.sources.unix
- Sender: sources
- Approved: rsalz@uunet.UU.NET
-
- Submitted-by: Brain in Neutral <bin@rhesus.primate.wisc.edu>
- Posting-number: Volume 16, Issue 52
- Archive-name: uniqbib
-
- Uniqbib is used in conjunction with the suite of programs comprising the
- refer system. One problem with this system is that there is no provision
- for removal of duplicates from bibliographic databases. (Duplicates may
- occur, for instance, if the output from two or more lookbib queries is
- combined, where entries satisfy the keywords of more than one search.)
- Uniqbib is used to remove such duplicates. It reads its input and writes
- all the unique entries therein to the standard output. The entries in the
- input do not have to be sorted (i.e., you do not have to run sortbib
- first), nor do the entries produced by uniqbib come out in any particular
- order.
-
- #!/bin/sh
- # shar: Shell Archiver
- # Run the following text with /bin/sh to create:
- # uniqbib.c
- # bibinfo.h
- # BibAlloc.c
- # BibCmp.c
- # BibRead.c
- # BibWrite.c
- # panic.c
- # uniqbib.1
- # Makefile
- # bad1
- # bad2
- # bad3
- echo shar: extracting uniqbib.c '(7434 characters)'
- sed 's/^XX//' << \SHAR_EOF > uniqbib.c
- XX/*
- XX uniqbib - uniq a bibliographic database to eliminate duplicates.
- XX
- XX syntax: uniqbib [ -v ] [ file ]
- XX
- XX -v means verbose: prints feedback.
- XX
- XX Only one file may be explicitly named (use cat f1 f2 ... | uniqbib
- XX for > 1 file).
- XX
- XX Strategy:
- XX Read input file, writing file position and summary information
- XX for each entry. Sort on summary info. For each set of entries
- XX having identical summary information, compare directly, and write
- XX out those that are different. (For those entries that have unique
- XX summary info, the entry is unique, of course.)
- XX
- XX The summary information is trivial: the sum of the characters in
- XX the text of the entry. This is invariant with respect to order of
- XX fields (records that are identical except for order of fields get
- XX identical checksums this way).
- XX
- XX 7 April 1988 Paul DuBois dubois@rhesus.primate.wisc.edu
- XX
- XX Change History:
- XX
- XX 08/10/88 Fixed failure to check for citations that are too long.
- XX Added -v flag. Put better error messages in BERead
- XX (prints text of citation seen so far, input line number
- XX where error occurred). Removed inability to uniq from
- XX a pipe or redirected input (citations are copied to
- XX holding file during reading phase 1).
- XX*/
- XX
- XX# include <stdio.h>
- XX# include <signal.h>
- XX# include "bibinfo.h"
- XX
- XX
- XX/* number of citations read during pass 1 (incremented in main) */
- XXint count = 0;
- XX/* non-zero when -v was given; enables progress output on stderr */
- XXint verbose = 0;
- XX
- XX/* temp-file removal hook and signal handler, both defined below */
- XXint cleanup ();
- XXint onintr ();
- XX
- XX/* names of the temporary key file and (when reading stdin) hold file */
- XXchar keyFile[BUFSIZ];
- XXchar holdFile[BUFSIZ];
- XX
- XX
- XX/*
- XX	Entry point.
- XX
- XX	Pass 1 reads every citation and writes one "checksum position"
- XX	line per citation to the key file (plus a copy of the citation to
- XX	the hold file when reading standard input).  Pass 2 sorts the key
- XX	file with sort(1) and hands it to UniqBib, which writes the unique
- XX	citations to stdout.
- XX
- XX	Exit status: 0 on success or empty input, 1 if a citation could
- XX	not be read (BERead has already said why) or on any panic.
- XX
- XX	NOTE(review): the temp file names are predictable (/tmp + pid);
- XX	consider mkstemp where available.
- XX*/
- XX
- XXmain (argc, argv)
- XXint	argc;
- XXchar	**argv;
- XX{
- XXBibEnt	*be;
- XXFILE	*kf, *hf;
- XXchar	buf[BUFSIZ];
- XXchar	*cp;
- XXlong	sum;
- XXint	len;
- XXint	result;
- XXint	empty = 1;
- XXint	useHold = 0;
- XXlong	holdPos = 0;
- XX
- XX/*
- XX	Arrange for cleanup of temp files in
- XX	the event of abnormal termination.
- XX*/
- XX
- XX	setpanichook (cleanup);
- XX	signal (SIGINT, onintr);
- XX	signal (SIGHUP, onintr);
- XX	signal (SIGQUIT, onintr);
- XX	signal (SIGTERM, onintr);
- XX
- XX/*
- XX	Process arguments and set up files.  If a file is named, use
- XX	that for input, else read whatever's on stdin and arrange to
- XX	hold a copy in a seekable temp file.
- XX*/
- XX
- XX	--argc;
- XX	++argv;
- XX	if (argc > 0 && strcmp (*argv, "-v") == 0)
- XX	{
- XX		++verbose;
- XX		--argc;
- XX		++argv;
- XX	}
- XX	if (argc == 0)	/* reading from pipe or redirection */
- XX	{
- XX		sprintf (holdFile, "/tmp/ubh%05d", getpid ());
- XX		if ((hf = fopen (holdFile, "w")) == NULL)
- XX			panic ("Can't open temporary hold file");
- XX		++useHold;
- XX	}
- XX	else if (argc == 1)
- XX	{
- XX		if (freopen (argv[0], "r", stdin) == NULL)
- XX			panic ("Can't open: %s", argv[0]);
- XX	}
- XX	else
- XX		panic ("Usage: uniqbib [-v] [file]");
- XX
- XX	sprintf (keyFile, "/tmp/ub%05d", getpid ());
- XX	if ((kf = fopen (keyFile, "w")) == NULL)
- XX		panic ("Can't open temporary key file.");
- XX
- XX/*
- XX	Ready to make first pass through input.  Read each citation,
- XX	compute the key and write out the key and file position of the
- XX	citation to the key file.  If reading a pipe or redirected input,
- XX	write out the citation to a holding file so it can be re-read.  In
- XX	that case, the file position must be fixed, since the position
- XX	in the holding file may well be different (e.g., if there are
- XX	multiple blank lines between citations in the original input).
- XX*/
- XX
- XX	if ((be = BEAlloc ()) == NULL)
- XX		panic ("Out of memory.");
- XX	if (verbose)
- XX		fprintf (stderr, "Reading citations\n");
- XX	while ((result = BERead (stdin, be)) > 0)
- XX	{
- XX		++count;
- XX		if (verbose)
- XX			fprintf (stderr, ".");
- XX		empty = 0;
- XX		sum = 0;
- XX		cp = BEText (be);
- XX		len = BELen (be);
- XX		if (useHold)
- XX		{
- XX			BEWrite (hf, be);
- XX			BEFilPos (be) = holdPos;
- XX			holdPos += len + 1;	/* +1 for preceding newline */
- XX		}
- XX		while (len-- > 0)	/* checksum: plain sum of the bytes */
- XX			sum += *cp++;
- XX		/* both values are long, so the portable conversion is %ld */
- XX		fprintf (kf, "%ld %ld\n", sum, BEFilPos (be));
- XX	}
- XX	/* a failed flush would silently lose keys, so check the close */
- XX	if (fclose (kf) == EOF)
- XX		panic ("Can't write temporary key file.");
- XX	if (useHold)	/* if using hold file, close and attach to stdin */
- XX	{
- XX		if (fclose (hf) == EOF)
- XX			panic ("Can't write temporary hold file");
- XX		if (freopen (holdFile, "r", stdin) == NULL)
- XX			panic ("Can't reopen hold file.");
- XX	}
- XX	if (result < 0)	/* bad citation: BERead printed the diagnostic */
- XX	{
- XX		cleanup ();
- XX		exit (1);
- XX	}
- XX	if (empty)	/* nothing to do */
- XX	{
- XX		cleanup ();
- XX		exit (0);
- XX	}
- XX	if (verbose)
- XX		fprintf (stderr, "\nPass 1 done (%d citations)\n", count);
- XX
- XX/*
- XX	Pass two.  Sort the keys so duplicates will cluster, and uniq
- XX	them.
- XX*/
- XX
- XX	if (verbose)
- XX		fprintf (stderr, "Sorting keys\n");
- XX	sprintf (buf, "exec sort -o %s %s", keyFile, keyFile);
- XX	if (system (buf) != 0)	/* unsorted keys would defeat UniqBib */
- XX		panic ("Can't sort key file.");
- XX	if (verbose)
- XX		fprintf (stderr, "Sort done\n");
- XX	if ((kf = fopen (keyFile, "r")) == NULL)
- XX		panic ("Can't reopen key file.");
- XX	UniqBib (kf);
- XX	fclose (kf);
- XX	if (verbose)
- XX		fprintf (stderr, "\nDone\n");
- XX	cleanup ();
- XX	exit (0);
- XX}
- XX
- XX
- XX
- XX/*
- XX The ugly heart of the program.
- XX
- XX Read checksum/file-position pairs from f. It's sorted on checksum,
- XX so that all groups of identical checksums will cluster. The
- XX bib entries within each cluster may or *may not* be content-identical,
- XX so the algorithm hangs onto each entry until it either knows it's
- XX unique or that it's a dup, as follows:
- XX
- XX First read one line and hold onto it for a reference. Then read rest
- XX of lines. If checksum is different, flush the reference and restart
- XX with the next line after the reference as the new reference. If the
- XX checksum is the same, then do a direct compare of the bib entries
- XX themselves. If they're the same, skip the reference and restart with
- XX the next line after the reference as the new reference. If they are
- XX different, just keep reading. (Eventually one will be found that's
- XX either a different checksum or identical, or EOF will be reached, so
- XX the reference can be either flushed or skipped.)
- XX
- XX When restarting so that the reference is bounced to the next
- XX line in the summary file, check first whether the comparison
- XX is that line. If so, don't bother to reread that line or to
- XX fetch the bibliographic entry itself. Since except in perverse
- XX cases the comparison almost always becomes the next reference, this
- XX is a big win.
- XX
- XX*/
- XX
- XX/*
- XX	f is the sorted key file, open for reading.  The citations
- XX	themselves are re-read from stdin by seeking to the positions
- XX	recorded in f.  Unique citations are written to stdout and a
- XX	one-line summary to stderr.
- XX*/
- XX
- XXUniqBib (f)
- XXFILE	*f;
- XX{
- XXlong	refPos, comPos;		/* reference and comparison seek positions */
- XXlong	refCkSum, comCkSum;	/* reference and comparison checksums */
- XXint	getNextRef = 1;		/* non-zero if need to read ref sum, pos */
- XXlong	refOff = 0, comOff;	/* offset of line after reference, comparison */
- XXint	nCompares = 0;		/* number of comparisons made with ref */
- XXBibEnt	b1, *beRef = &b1;
- XXBibEnt	b2, *beCom = &b2;
- XXBibEnt	*beTmp;
- XXint	nondups = 0;
- XX
- XX	if (verbose)
- XX		fprintf (stderr, "Comparing keys\n");
- XX	for (;;)
- XX	{
- XX		if (verbose)
- XX			fprintf (stderr, ".");
- XX		if (getNextRef)
- XX		{
- XX			getNextRef = 0;
- XX			if (nCompares == 1)	/* make comparison next ref */
- XX			{
- XX				/* swap buffers rather than re-read the entry */
- XX				refCkSum = comCkSum;
- XX				refPos = comPos;
- XX				refOff = comOff;
- XX				beTmp = beRef;
- XX				beRef = beCom;
- XX				beCom = beTmp;
- XX			}
- XX			else	/* seek to correct spot, read summary */
- XX			{	/* info and entry from bib file */
- XX				fseek (f, refOff, 0);
- XX				/* targets are long, so scan with %ld */
- XX				if (fscanf (f, "%ld %ld\n", &refCkSum, &refPos)
- XX									!= 2)
- XX					break;	/* no more refs in file */
- XX				refOff = ftell (f);
- XX				fseek (stdin, refPos, 0);
- XX				/* BERead yields 0 at EOF and -1 on error */
- XX				if (BERead (stdin, beRef) <= 0)
- XX					panic ("Can't read reference entry.");
- XX			}
- XX			nCompares = 0;
- XX		}
- XX		if (fscanf (f, "%ld %ld\n", &comCkSum, &comPos) != 2)
- XX		{
- XX			/* ran off the end: nothing left that could match */
- XX			BEWrite (stdout, beRef);	/* flush reference */
- XX			++nondups;
- XX			++getNextRef;
- XX			continue;
- XX		}
- XX		comOff = ftell (f);
- XX		fseek (stdin, comPos, 0);
- XX		if (BERead (stdin, beCom) <= 0)
- XX			panic ("Can't read comparison entry.");
- XX		++nCompares;
- XX		if (refCkSum != comCkSum)	/* different - flush */
- XX		{
- XX			BEWrite (stdout, beRef);	/* current reference */
- XX			++nondups;
- XX		}
- XX		else if (BECmp (beRef, beCom))	/* compare directly, */
- XX			continue;		/* keep ref if diff */
- XX		/* ref was flushed, or is a dup and is dropped */
- XX		++getNextRef;
- XX	}
- XX	fprintf (stderr, "%d citations (%d + %d duplicates)\n",
- XX				count, nondups, count-nondups);
- XX}
- XX
- XX
- XX/*
- XX	Remove the temporary key file and hold file.  Failures are
- XX	ignored.  Installed as the panic hook and also called by main
- XX	before a normal exit.
- XX*/
- XXcleanup ()
- XX{
- XXchar	*tmpName[2];
- XXint	i;
- XX
- XX	tmpName[0] = keyFile;
- XX	tmpName[1] = holdFile;
- XX	for (i = 0; i < 2; i++)
- XX		(void) unlink (tmpName[i]);
- XX}
- XX
- XX/*
- XX	Handler that main installs for SIGINT, SIGHUP, SIGQUIT and
- XX	SIGTERM: hand the interruption message to panic.
- XX*/
- XXonintr ()
- XX{
- XX	panic ("\nInterrupted...");
- XX}
- SHAR_EOF
- if test 7434 -ne "`wc -c uniqbib.c`"
- then
- echo shar: error transmitting uniqbib.c '(should have been 7434 characters)'
- fi
- echo shar: extracting bibinfo.h '(2596 characters)'
- sed 's/^XX//' << \SHAR_EOF > bibinfo.h
- XX/*
- XX bibinfo.h - constants, types and variables to support operations
- XX on bibliography entries.
- XX
- XX The BibEnt type is the basic unit of bibliographic information.
- XX It contains a buffer holding the text of the entry (unmodified),
- XX and structures which are set up when the entry is read in from
- XX a file. These structures allow easy access to the various fields
- XX of the entry. They also allow quick checks as to whether a field
- XX is present in the entry, how many authors there are, and so forth.
- XX
- XX Macros and functions are used to access parts of records. This
- XX allows the underlying implementation to change while preserving
- XX the programming interface. Currently the implementation is
- XX as follows:
- XX
- XX Lines of a bibliography entry are read one after the other
- XX into a buffer. As each is read, the lines are categorized and
- XX information set up to allow each field to be accessed easily.
- XX For each field (except authors), one BibFld struct is set up.
- XX bfStart is the offset into the text buffer of the beginning of the
- XX field (starts at the % itself), and bfLen is the length of the
- XX field (includes length of all continuation lines). Authors are
- XX anomalous, since there may be many of them; BibFld's for these are
- XX stored at the end of the regular field array, and there is a count
- XX variable.
- XX*/
- XX
- XX# ifndef NULL
- XX# define NULL (0)
- XX# endif
- XX
- XX
- XX# define beMaxBuf 1024 /* max size of entry's text */
- XX# define beMaxAuth 20 /* max authors allowed */
- XX
- XX
- XX/*
- XX	Location of one field inside a BibEnt's text buffer.  bfLen is
- XX	zero for a field that is not present in the entry.
- XX*/
- XXtypedef struct
- XX{
- XX int bfStart; /* index of start of bib field text */
- XX int bfLen; /* length of bib field text */
- XX} BibFld;
- XX
- XX
- XX/*
- XX	One bibliography entry.  beFld[0..25] are indexed by field
- XX	letter minus 'A' (see BEFldPtr); authors are kept separately in
- XX	beFld[26..26+beMaxAuth-1], in order of appearance (see BEAuthPtr).
- XX*/
- XXtypedef struct
- XX{
- XX int beTextLen; /* length of entry text */
- XX char beText[beMaxBuf]; /* entry text */
- XX BibFld beFld[26+beMaxAuth]; /* field information */
- XX int beAuthCnt; /* number of authors */
- XX long beFilPos; /* starting position in file */
- XX} BibEnt;
- XX
- XX
- XX/*
- XX Macros for accessing various pieces of the bibliographic
- XX entry. Most of these can be used as lvalues. Arguments
- XX should be as follows:
- XX
- XX BibEnt *be
- XX BibFld *bf
- XX char f ('B'..'Z')
- XX int n (0..BEAuthCnt(be)-1)
- XX
- XX*/
- XX
- XX/*
- XX	Every argument is parenthesized so that expressions may be
- XX	passed safely.  The index argument n of the BEAuth* macros is
- XX	evaluated exactly once, so BEAuthPtr (be, BEAuthCnt (be)++)
- XX	remains valid.
- XX*/
- XX
- XX# define	BEText(be)		((be)->beText)
- XX# define	BELen(be)		((be)->beTextLen)
- XX# define	BEFilPos(be)		((be)->beFilPos)
- XX
- XX# define	BEFldPtr(be,f)		(&((be)->beFld[(f)-'A']))
- XX/* non-zero if field letter f is present in the entry */
- XX# define	BEHaveFld(be,f)		(BEFldLen (BEFldPtr (be, f)) != 0)
- XX
- XX# define	BEFldStart(bf)		((bf)->bfStart)
- XX# define	BEFldLen(bf)		((bf)->bfLen)
- XX# define	BEFldText(be,bf)	(&(BEText (be)[BEFldStart (bf)]))
- XX
- XX/* authors live after the 26 letter slots: 'Z'+n+1-'A' == 26+n */
- XX# define	BEAuthPtr(be,n)		(BEFldPtr (be, 'Z'+(n)+1))
- XX# define	BEAuthLen(be,n)		(BEFldLen (BEAuthPtr (be, n)))
- XX# define	BEAuthText(be,n)	(BEFldText (be, BEAuthPtr (be, n)))
- XX
- XX# define	BEAuthCnt(be)		((be)->beAuthCnt)
- XX
- XX
- XXBibEnt *BEAlloc ();
- SHAR_EOF
- if test 2596 -ne "`wc -c bibinfo.h`"
- then
- echo shar: error transmitting bibinfo.h '(should have been 2596 characters)'
- fi
- echo shar: extracting BibAlloc.c '(205 characters)'
- sed 's/^XX//' << \SHAR_EOF > BibAlloc.c
- XX/*
- XX BibEnt structure allocation and disposal
- XX*/
- XX
- XX# include "bibinfo.h"
- XX
- XX/*
- XX	Allocate one zero-filled BibEnt with calloc.  The result of
- XX	calloc is returned as is, so it is NULL when allocation fails.
- XX*/
- XXBibEnt *BEAlloc ()
- XX{
- XXchar	*calloc ();
- XXBibEnt	*ent;
- XX
- XX	ent = (BibEnt *) calloc (1, sizeof (BibEnt));
- XX	return (ent);
- XX}
- XX
- XX
- XX/*
- XX	Dispose of a BibEnt; bp is handed straight to free.
- XX*/
- XXBEFree (bp)
- XXBibEnt	*bp;
- XX{
- XX	free ((char *) bp);
- XX}
- SHAR_EOF
- if test 205 -ne "`wc -c BibAlloc.c`"
- then
- echo shar: error transmitting BibAlloc.c '(should have been 205 characters)'
- fi
- echo shar: extracting BibCmp.c '(1623 characters)'
- sed 's/^XX//' << \SHAR_EOF > BibCmp.c
- XX/*
- XX Compare two bibliographic entries for equality. They do
- XX not need to have the fields in the same order (except that
- XX authors must be in the same order), just the same fields
- XX present, and the same information in corresponding fields.
- XX
- XX Returns:
- XX
- XX 0 b1 = b2
- XX !0 b1 != b2
- XX
- XX This should be extended to allow comparison to return ordering
- XX information by passing in sort ordering information.
- XX
- XX First compare the number of authors.
- XX If equal, compare the authors by length and text.
- XX If equal, see if the same fields are present and have the same length
- XX and the same text.
- XX If equal, they're equal.
- XX Otherwise, not.
- XX*/
- XX
- XX# include <stdio.h>
- XX# include "bibinfo.h"
- XX
- XX/*
- XX	RETURN (x, y) returns y from the enclosing function.  When DEBUG
- XX	is defined it first prints the reason string x on stdout.  DEBUG
- XX	is forcibly undefined here, so the tracing form is disabled.
- XX*/
- XX# undef DEBUG
- XX# ifdef DEBUG
- XX# define RETURN(x,y) return (printf ("%s\n", x) ? y : y)
- XX# else
- XX# define RETURN(x,y) return (y)
- XX# endif
- XX
- XX/*
- XX	Compare two entries for equality.  Returns 0 if b1 and b2 have
- XX	the same authors in the same order and the same non-author
- XX	fields with identical text; returns 1 otherwise.
- XX*/
- XXBECmp (b1, b2)
- XXBibEnt	*b1, *b2;
- XX{
- XXint	i, len, n;
- XXBibFld	*bfp1, *bfp2;
- XX
- XX	if ((n = BEAuthCnt (b1)) != BEAuthCnt (b2))
- XX		RETURN ("author count", 1);
- XX	for (i = 0; i < n; i++)
- XX	{
- XX		bfp1 = BEAuthPtr (b1, i);
- XX		bfp2 = BEAuthPtr (b2, i);
- XX		if ((len = BEFldLen (bfp1)) != BEFldLen (bfp2))
- XX			RETURN ("author length", 1);
- XX		if (strncmp (BEFldText (b1, bfp1), BEFldText (b2, bfp2), len))
- XX			RETURN ("author text", 1);
- XX	}
- XX	for (i = 'B'; i <= 'Z'; i++)
- XX	{
- XX/*
- XX	Don't have to check whether fields are present or not, since
- XX	missing ones have length zero and will compare properly.
- XX*/
- XX		bfp1 = BEFldPtr (b1, i);
- XX		bfp2 = BEFldPtr (b2, i);
- XX		if ((len = BEFldLen (bfp1)) != BEFldLen (bfp2))
- XX			RETURN ("field length", 1);
- XX/*
- XX	A field missing from both entries has no start offset: BERead
- XX	only zeroes its length.  Skip it rather than build text
- XX	pointers from an offset that was never set.
- XX*/
- XX		if (len == 0)
- XX			continue;
- XX		if (strncmp (BEFldText (b1, bfp1), BEFldText (b2, bfp2), len))
- XX			RETURN ("field text", 1);
- XX	}
- XX	RETURN ("same", 0);
- XX}
- SHAR_EOF
- if test 1623 -ne "`wc -c BibCmp.c`"
- then
- echo shar: error transmitting BibCmp.c '(should have been 1623 characters)'
- fi
- echo shar: extracting BibRead.c '(2701 characters)'
- sed 's/^XX//' << \SHAR_EOF > BibRead.c
- XX/*
- XX Read a bibliographic entry from a file.
- XX
- XX Read until have a non-blank line, then read until a blank line or
- XX EOF. Return number of lines in citation. (Zero means EOF, -1 error).
- XX
- XX Errors are returned for various conditions:
- XX
- XX text of citation too long
- XX more than beMaxAuth authors
- XX first line does not begin with %
- XX % lines with non-capital letter following %
- XX*/
- XX
- XX# include <stdio.h>
- XX# include <ctype.h>
- XX# include <varargs.h>
- XX# include "bibinfo.h"
- XX
- XX
- XX/* bumped once per pass of BERead's read loop; shown in error messages */
- XXstatic long line = 0;
- XX/* bumped once per BERead call; shown in error messages */
- XXstatic long citation = 0;
- XX
- XX
- XX/* BRError is defined after its first use; declare its linkage up front */
- XXstatic int BRError ();
- XX
- XX/*
- XX	Read the next citation from f into bi.
- XX
- XX	Leading blank lines are skipped; the citation ends at the next
- XX	blank line or at EOF.  On success the text, field table, author
- XX	count and starting file position of bi are filled in.
- XX
- XX	Returns the number of non-blank lines read (0 means EOF with no
- XX	citation), or -1 after reporting an error through BRError.
- XX*/
- XXBERead (f, bi)
- XXFILE	*f;
- XXBibEnt	*bi;
- XX{
- XXBibFld	*bf = NULL;
- XXchar	*bp = BEText (bi);
- XXint	left = sizeof (BEText (bi));
- XXint	read = 1;
- XXint	nLines = 0;
- XXint	i, len;
- XXint	c;
- XX
- XX/*
- XX	Initialize entry to "nothing"
- XX*/
- XX
- XX	BELen (bi) = 0;			/* no text */
- XX	BEAuthCnt (bi) = 0;		/* no authors */
- XX	for (i = 'A'; i <= 'Z'; i++)	/* no other fields, either */
- XX		BEFldLen (BEFldPtr (bi, i)) = 0;
- XX
- XX	++citation;
- XX
- XX	while (read)
- XX	{
- XX		++line;
- XX		if ((c = getc (f)) == EOF)
- XX			break;
- XX		if (c == '\n')		/* blank line */
- XX		{
- XX			read = (nLines == 0);	/* if haven't seen ref, keep */
- XX			continue;		/* reading, else skip line */
- XX		}
- XX		if (++nLines == 1)	/* set pos on first line */
- XX		{
- XX			BEFilPos (bi) = ftell (f) - 1;
- XX			if (c != '%')	/* make sure not cont. line */
- XX			{
- XX				/*
- XX					Nothing is in the buffer yet.  Collect the
- XX					offending line and terminate it so that the
- XX					%s below prints that line and nothing else.
- XX				*/
- XX				for (len = 0; len < left - 1 && c != EOF && c != '\n'; c = getc (f))
- XX					bp[len++] = c;
- XX				bp[len] = '\0';
- XX				BRError (bi, "First line in citation does not begin with '%%':\n%s\n", bp);
- XX				return (-1);
- XX			}
- XX		}
- XX		for (len = 0; left-- > 0; )	/* read line into buffer */
- XX		{
- XX			bp[len++] = c;
- XX			if (c == '\n')		/* have full line now */
- XX				break;
- XX			if ((c = getc (f)) == EOF)
- XX			{
- XX				read = 0;	/* done reading */
- XX				break;
- XX			}
- XX		}
- XX		BELen (bi) += len;
- XX		if (left < 0)		/* ran out of room mid-line */
- XX		{
- XX			/* sizeof yields a size_t; %d needs an int */
- XX			BRError (bi, "Citation too long, exceeds %d characters\n", (int) sizeof (BEText (bi)));
- XX			return (-1);
- XX		}
- XX		if (*bp == '%')		/* start of new field */
- XX		{
- XX			/*
- XX				A lone '%' at EOF has no key letter at all;
- XX				otherwise test the letter as an unsigned char,
- XX				as the ctype functions require.
- XX			*/
- XX			if (len < 2 || !isupper ((unsigned char) bp[1]))
- XX			{
- XX				BRError (bi, "Bad key line\n");
- XX				return (-1);
- XX			}
- XX			if (bp[1] != 'A')
- XX				bf = BEFldPtr (bi, bp[1]);
- XX			else if (BEAuthCnt (bi) >= beMaxAuth)
- XX			{
- XX				BRError (bi, "Too many authors, only %d allowed.\n", beMaxAuth);
- XX				return (-1);
- XX			}
- XX			else
- XX				bf = BEAuthPtr (bi, BEAuthCnt (bi)++);
- XX
- XX			BEFldStart (bf) = bp - BEText (bi);
- XX			BEFldLen (bf) = 0;
- XX		}
- XX		/* continuation lines extend the field most recently started */
- XX		BEFldLen (bf) += len;
- XX		bp += len;
- XX	}
- XX
- XX	return (nLines);
- XX}
- XX
- XX
- XX
- XX/*
- XX	Report a read error on stderr.
- XX
- XX	Arguments: the BibEnt being read, a printf-style format, then
- XX	the values the format consumes.  Prints the current input line
- XX	and citation numbers, the formatted message, and whatever text
- XX	of the entry had been read before the error.
- XX*/
- XXstatic BRError (va_alist)
- XXva_dcl
- XX{
- XXva_list	args;
- XXBibEnt	*bi;
- XXchar	*fmt;
- XX
- XX	va_start (args);
- XX	bi = va_arg (args, BibEnt *);
- XX	fmt = va_arg (args, char *);
- XX	/* line and citation are long: %ld is the portable conversion */
- XX	fprintf (stderr, "Error at line %ld (in citation %ld)\n", line, citation);
- XX	vfprintf (stderr, fmt, args);
- XX	if (BELen (bi) > 0)
- XX	{
- XX		fprintf (stderr, "Text of entry up to error:\n");
- XX		BEWrite (stderr, bi);
- XX	}
- XX	va_end (args);
- XX}
- SHAR_EOF
- if test 2701 -ne "`wc -c BibRead.c`"
- then
- echo shar: error transmitting BibRead.c '(should have been 2701 characters)'
- fi
- echo shar: extracting BibWrite.c '(265 characters)'
- sed 's/^XX//' << \SHAR_EOF > BibWrite.c
- XX/*
- XX Write a bibliographic entry to a file. This is simple; just
- XX write a newline followed by the text of the entry.
- XX*/
- XX
- XX# include <stdio.h>
- XX# include "bibinfo.h"
- XX
- XX
- XX/*
- XX	Emit one entry on f: a newline, then the BELen (be) bytes of
- XX	the entry's text.
- XX*/
- XXBEWrite (f, be)
- XXFILE	*f;
- XXBibEnt	*be;
- XX{
- XXchar	*text = BEText (be);
- XXint	nBytes = BELen (be);
- XX
- XX	putc ('\n', f);
- XX	fwrite (text, 1, nBytes, f);
- XX}
- SHAR_EOF
- if test 265 -ne "`wc -c BibWrite.c`"
- then
- echo shar: error transmitting BibWrite.c '(should have been 265 characters)'
- fi
- echo shar: extracting panic.c '(670 characters)'
- sed 's/^XX//' << \SHAR_EOF > panic.c
- XX/*
- XX panic - print message and die with status 1. Uses vfprintf
- XX so that panic can take variable argument lists.
- XX
- XX setpanichook - install function to be called with no arguments
- XX after printing panic message and before exiting. Can be used
- XX for additional cleanup, like removing temp files, etc.
- XX*/
- XX
- XX# include <stdio.h>
- XX# include <varargs.h>
- XX
- XX
- XX/* function that panic calls just before exiting; NULL means none */
- XXstatic int (*panichook) () = NULL;
- XX
- XX/*
- XX	Record p as the panic hook, replacing any hook set earlier.
- XX*/
- XXvoid
- XXsetpanichook (p)
- XXint	(*p) ();
- XX{
- XX	panichook = p;
- XX	return;
- XX}
- XX
- XX
- XX/*
- XX	Print a printf-style message and a newline on stderr, run the
- XX	installed panic hook if there is one, then exit with status 1.
- XX	The first argument is the format; the rest are its values.
- XX*/
- XXvoid
- XXpanic (va_alist)
- XXva_dcl
- XX{
- XXva_list	ap;
- XXchar	*message;
- XX
- XX	va_start (ap);
- XX	message = va_arg (ap, char *);
- XX	vfprintf (stderr, message, ap);
- XX	va_end (ap);
- XX	fprintf (stderr, "\n");
- XX	if (panichook)
- XX		(*panichook) ();
- XX	exit (1);
- XX}
- SHAR_EOF
- if test 670 -ne "`wc -c panic.c`"
- then
- echo shar: error transmitting panic.c '(should have been 670 characters)'
- fi
- echo shar: extracting uniqbib.1 '(2518 characters)'
- sed 's/^XX//' << \SHAR_EOF > uniqbib.1
- XX.\" xroff -man.new % | lpr
- XX.TH uniqbib 1
- XX.UC 4
- XX.SH NAME
- XXuniqbib \- remove duplicates from bibliographic database
- XX.SH SYNTAX
- XX.B uniqbib
- XX[
- XX.B \-v
- XX] [
- XX.I file
- XX]
- XX.SH DESCRIPTION
- XX.I Uniqbib
- XXis used in conjunction with the suite of programs comprising the
- XX.I refer
- XXsystem.
- XXOne problem with this system is that there is no provision for removal
- XXof duplicates from bibliographic databases.
- XX(Duplicates may occur, for instance, if the output from two or more
- XX.I lookbib
- XXqueries is combined, where entries
- XXsatisfy the keywords of more than one search.)
- XX.I Uniqbib
- XXis used to remove such duplicates.
- XXIt reads its input and writes all the unique entries therein to the
- XXstandard output.
- XXThe entries in the input do not have to be sorted (i.e., you do not have to
- XXrun
- XX.I sortbib
- XXfirst), nor do the entries produced by
- XX.I uniqbib
- XXcome out in any particular order.
- XX.PP
- XXIf a file is named,
- XX.I uniqbib
- XXreads it; otherwise it reads its standard input.
- XXThis means that
- XX.I uniqbib
- XXcan be used in pipelines.
- XXIf more than one file is to be processed, use
- XX.PP
- XX.nf
- XX.in +.5i
- XXcat input1 input2 ... input\f2n\f1 | uniqbib > output
- XX.in -.5i
- XX.fi
- XX.PP
- XXEntries are considered duplicates if
- XX(i) they contain the same authors, in the same order; and (ii)
- XXthey contain the same non-author fields, and the content of corresponding
- XXfields is identical.
- XX.PP
- XXFields (except for authors) do
- XX.I not
- XXhave to appear in the same order for two entries to be considered identical.
- XXFor example, the following two entries are the same as far as
- XX.I uniqbib
- XXis concerned, because within the entries, the authors appear in the same
- XXorder, and the other fields are identical with respect to content, even
- XXthough they appear in different orders.
- XX.PP
- XX.nf
- XX.in +.5i
- XX.ta 2.5i
- XX%A First Author %T My title
- XX%T My title %J Some journal
- XX%A Second Author %A First Author
- XX%D 1988 %A Second Author
- XX%N 4 %D 1988
- XX%J Some journal %V 198
- XX%V 198 %N 4
- XX.in -.5i
- XX.fi
- XX.PP
- XXThe following entries are
- XX.I not
- XXidentical:
- XX.PP
- XX.nf
- XX.in +.5i
- XX.ta 2.5i
- XX%A First Author %A Second author
- XX%A Second Author %A First Author
- XX%T My title %T My title
- XX%J Some journal %J Some journal
- XX%V 198 %V 198
- XX%N 4 %N 4
- XX%D 1988 %D 1988
- XX.in -.5i
- XX.fi
- XX.PP
- XXIf the
- XX.B \-v
- XXflag is given,
- XX.I uniqbib
- XXgoes into verbose mode;
- XXit announces the beginning and end of each phase of analysis, and prints
- XXa ``.'' as each citation is read during the initial and comparison phases.
- XX.SH "SEE ALSO"
- XXlookbib(1), refer(1), sortbib(1)
- XX.SH "WHO-TO-BLAME"
- XXPaul DuBois, dubois@rhesus.primate.wisc.edu
- XX.SH BUGS
- XXShould probably be named ``snail.''
- SHAR_EOF
- if test 2518 -ne "`wc -c uniqbib.1`"
- then
- echo shar: error transmitting uniqbib.1 '(should have been 2518 characters)'
- fi
- echo shar: extracting Makefile '(841 characters)'
- sed 's/^XX//' << \SHAR_EOF > Makefile
- XX# Build, install and self-test rules for uniqbib and its support library.
- XX# Recipe lines must begin with a tab.
- XXBINDIR=/usr/local
- XXLIBS=biblib
- XXINSTALL= -c -m 755 -o bin -g system
- XXTROFF=xroff
- XXMANMACROS=man.new
- XXLOBJS=\
- XX	BibAlloc.o\
- XX	BibCmp.o\
- XX	BibRead.o\
- XX	BibWrite.o\
- XX	panic.o
- XX
- XXall: uniqbib biblib
- XXinstall: iuniqbib ibiblib
- XXclean:
- XX	rm -f *.o uniqbib biblib
- XX
- XXuniqbib: uniqbib.o biblib
- XX	cc uniqbib.o ${LIBS} -o uniqbib
- XXiuniqbib: uniqbib
- XX	install ${INSTALL} uniqbib ${BINDIR}
- XX
- XXuniqbib.o: bibinfo.h
- XX${LOBJS}: bibinfo.h
- XX
- XXbiblib: ${LOBJS}
- XX	-rm -f biblib
- XX	ar r biblib ${LOBJS}
- XX	ranlib biblib
- XXibiblib:
- XX
- XX# Expands to "xroff -man.new", matching the first line of uniqbib.1.
- XXman: uniqbib.1
- XX	${TROFF} -${MANMACROS} uniqbib.1 | lpr
- XX
- XXshar:
- XX	cshar -a uniqbib.c bibinfo.h BibAlloc.c BibCmp.c BibRead.c \
- XX		BibWrite.c panic.c uniqbib.1 Makefile \
- XX		bad1 bad2 bad3 > uniqbib.sha
- XX
- XX# Run the freshly built binary, not whatever uniqbib is on PATH.
- XX# Each run is expected to fail with the diagnostic announced before it.
- XXtest: uniqbib
- XX	@echo ERROR IS: too many authors
- XX	-./uniqbib bad1
- XX	@echo ERROR IS: citation too long
- XX	-./uniqbib bad2
- XX	@echo ERROR IS: first line is non-key line
- XX	-./uniqbib bad3
- SHAR_EOF
- if test 841 -ne "`wc -c Makefile`"
- then
- echo shar: error transmitting Makefile '(should have been 841 characters)'
- fi
- echo shar: extracting bad1 '(223 characters)'
- sed 's/^XX//' << \SHAR_EOF > bad1
- XX
- XX%A auth 1
- XX%A auth 2
- XX%A auth 3
- XX%A auth 4
- XX%A auth 5
- XX%A auth 6
- XX%A auth 7
- XX%A auth 8
- XX%A auth 9
- XX%A auth 10
- XX%A auth 11
- XX%A auth 12
- XX%A auth 13
- XX%A auth 14
- XX%A auth 15
- XX%A auth 16
- XX%A auth 17
- XX%A auth 18
- XX%A auth 19
- XX%A auth 20
- XX%A auth 21
- SHAR_EOF
- if test 223 -ne "`wc -c bad1`"
- then
- echo shar: error transmitting bad1 '(should have been 223 characters)'
- fi
- echo shar: extracting bad2 '(1025 characters)'
- sed 's/^XX//' << \SHAR_EOF > bad2
- XX%A aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
- XX%T ttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttt
- XX%V vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
- XX%N nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn
- XX%P PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP
- XX%C ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
- XX%I iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii
- XX%D ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd
- XX%K kkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkk
- XX%O ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo
- XX%E eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
- XX%B bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
- XX%X xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
- SHAR_EOF
- if test 1025 -ne "`wc -c bad2`"
- then
- echo shar: error transmitting bad2 '(should have been 1025 characters)'
- fi
- echo shar: extracting bad3 '(3 characters)'
- sed 's/^XX//' << \SHAR_EOF > bad3
- XX
- XXt
- SHAR_EOF
- if test 3 -ne "`wc -c bad3`"
- then
- echo shar: error transmitting bad3 '(should have been 3 characters)'
- fi
- # End of shell archive
- exit 0
-
-