This program does not accept file names as arguments. To process more than one input file, pipe the output of cat(1) into sample.
The program does not need to know how many lines there are in the standard input, nor does it need to read the standard input more than once. However, the sample must fit into memory.
outputs 10 words selected at random from /usr/dict/words
outputs 16 (020 is an octal number) random users from the Yellow Pages.
outputs 30 randomly selected files in the current directory whose names match the pattern "*.o", but if any such file has a name exceeding 14 characters (not just the ones selected) it prints an error message and stops without outputting any file names.
outputs 100 lines drawn at random from foo and baz. It is possible that all the lines might come from foo or all of them from baz.
outputs 100 lines in random order with 50 of them coming from foo and 50 coming from baz, but randomly interleaved.
an input line is too long (-w option), or
the program is unable to obtain enough memory using malloc (if you want N lines each of which can contain W characters, you may need as much as (N+2)*(W+6) bytes), or
the command line is botched.
All error messages indicate the actual name the program was invoked by. If an error message is produced, the rest of the standard input is not read, so you are likely to get "Broken pipe" messages from upstream processes.
SYNOPSIS
cat file... | sample -# >output
WHAT IT DOES
sample -N
selects a random sample of N lines from its standard input
and writes them to its standard output. Although this has
the form of a filter, no output can be generated until all
of the input has been consumed. The sample is written out
in random order. All samples are generated with the same
probability, as are all permutations of each sample.
OPTIONS
It is a general policy of mine to make options insensitive
to case, and to reserve the options
-silent [as in many System V utilities]
-interactive [as in rm(1)]
-file [for command files as in awk(1) or sed(1)]
-e<C> [for an escape character]
-t<C> [for a field delimiter]
-width N [for the input line width limit]
-output F [as in cc(1) or sort(1)]
Other things being equal, my policy on file names is to treat
them as if there were an implicit "cat".
This program does not accept file names in its command line;
it only works from the standard input. The only relevant
"standard" option is -width.
If your records consist of more than one line, use
paste -s -d' file... | sample | tr '' '
with n-1 ^As in the paste argument to glue n lines together.
This will be handled by a -group option in a later version.
If you specify a sample size it is an error for the input
to contain fewer lines. You can get a random permutation
of all the input lines by writing
cat file ... >#scratch
sample -`wc -l #scratch` <#scratch >output
In addition to the sample size, you can specify the maximum
size of an input line by writing
-width #
-width=#
-w#
or any obvious variant.
Normally, a different sample will be generated every time
the program is run. To get the same sample, specify
-randomise #
-randomise=#
-r#
or any obvious variant.
All numbers in the command line are processed with strtol(3)
so decimal, octal, and hex can be used.
DEFAULTS
-width 1024
-10
SEE ALSO
Knuth, The Art of Computer Programming, Vol 2, page 138.
Knuth This program
n sample_size
t records_read_so_far+1
M random_choice+1
*/
#include <stdio.h> extern long strtol(/* char*, char**, int */); extern char *malloc(/* unsigned */); extern char *fgets(/* char*, unsigned, FILE* */); extern char *strcpy(/* char*, char* */); extern /*void*/ srandom(/* long */); extern long random(); extern long time(/* long* */);
#define IsDigit(c) ((unsigned)((c) - '0') < 10) #define Random(n) ((unsigned)(random()) % (unsigned)(n))
#define DFLT_SAMPLE_SIZE 10
#define DFLT_MAX_LINE_WIDTH 256
char *program_name; /* For error messages */
int max_line_width; /* -width : bound on length of input lines */
char *input_line; /* Dynamically allocated buffer */
int sample_size; /* -# : the number of lines to output */
int records_read_so_far; /* just what it says */
void OutputError()
{
(void) fprintf(stderr,
"%s: I/O error near byte %ld of output,
program_name, ftell(stdout));
exit(1);
}
/* estrtol(input, option, must_be_positive)
converts a string (input) to binary.
If must_be_positive is true, the result should be > 0.
If anything goes wrong, an error message is written to the
standard error stream and the program exits.
This should only be used for converting numeric program arguments.
Any integer value acceptable to C is accepted.
Note that both the strtol() and the sscanf() version will accept
leading white-space characters; I don't really want this, but it
seems harmless.
The sign check is actually useless in this program, because a
negative sign will be ineffective (-r-3) or rejected (-r -3)
no matter what.
*/
long estrtol(input, option, must_be_positive)
char *input; /* to be converted */
char *option; /* for error message */
int must_be_positive;
{
long result;
#ifdef NOSTRTOL
int nmatched;
char c;
if (input[0] != '0') {
/* decimal */
nmatched = sscanf(input, "%ld%c", &result, &c);
} else
if (input[1] == 'x' || input[1] == 'X') {
/* 0x: hexadecimal */
nmatched = sscanf(input+2, "%lx%c", &result, &c);
} else {
/* 0: octal */
nmatched = sscanf(input, "%lo%c", &result, &c);
}
if (nmatched != 1 || must_be_positive && result <= 0) {
#else
char *after;
result = strtol(input, &after, 0);
if (after == input || *after || must_be_positive && result <= 0) {
#endif
(void) fprintf(stderr,
"%s: ill-formed %s value: %s,
program_name, option, input);
exit(1);
}
return result;
}
/* read_input_line()
reads one complete line (terminated by a new-line character or the
end of the stream) from the standard input stream. If this line
contains at most max_line_width characters, including the new-line
character and a terminating NUL, those characters are stored in
input_line and a normal return is done. If the end of the stream
is encountered without any characters being read, NULL is returned
and input_line is undefined. If the line is too long, an error
message is printed and the program terminates.
Note that the user-supplied line width is incremented by 2 to allow
for the ' and ' '.
*/
char *read_input_line()
{
int L;
if (!fgets(input_line, max_line_width, stdin)) return NULL;
L = strlen(input_line);
if (input_line[L-1] != ') {
/* the input line is too long; fgets has left the */
/* rest of it unread in the standard input */
while (fgets(input_line, max_line_width, stdin)) {
int d = strlen(input_line);
L += d;
if (input_line[d-1] == ') break;
}
(void) fprintf(stderr,
"%s: line %d of the input has %d characters, which exceeds %d,
program_name, records_read_so_far+1, L-1, max_line_width-2);
exit(1);
}
return input_line;
}
/* copy_input_line()
copies the significant part of input_line into a newly allocated
block and returns that block. If more space cannot be allocated,
an error message is printed and the program terminates. Note
that the max_line_width has no effect on the size of block which
is allocated: only the actual length of the line matters.
*/
char *copy_input_line()
{
int L;
char *result;
L = strlen(input_line);
result = malloc((unsigned)(L+1));
if (!result) {
(void) fprintf(stderr,
"%s: unable to allocate %d bytes for line %d of the input,
program_name, L+1, records_read_so_far+1);
exit(1);
}
return strcpy(result, input_line);
}
/* base_part(filename)
takes a file name of the form xxx/xxx/xxx/basepart
and returns a pointer to the beginning of the basepart.
That is, to just after the last / if there is one, or to the
beginning of the filename is there is none.
In this program, this is just used in error messages.
*/
char *base_part(filename)
char *filename;
{
char *b;
for (b = filename; *b; b++)
if (*b == '/') filename = b+1;
return filename;
}
/* crack(argc, argv)
analyses the arguments. It has to set
program_name from argv[0]
max_line_width from -w[idth][ ]N
random seed from -r[andomise][ ]N
sample_size from -#
*/
void crack(argc, argv)
int argc;
char **argv;
{
int random_seed;
char **ap;
char *p;
program_name = "sample";
max_line_width = DFLT_MAX_LINE_WIDTH;
random_seed = (int)time((long*)0);
sample_size = DFLT_SAMPLE_SIZE;
if (argc == 0) return; /* can this ever happen? */
program_name = argv[0];
for (ap = argv; p = *++ap; ) {
if (*p != '-') {
(void) fprintf(stderr,
"%s: illegal argument %s,
program_name, p);
exit(1);
}
switch (*++p) {
case 'w': case 'W': /* -width */
while (*p && !IsDigit(*p)) p++;
if (!*p) {
/* The input is -w # as two arguments */
p = *++ap;
if (!p || !IsDigit(*p)) {
(void) fprintf(stderr,
"%s: no value for %s option,
program_name, ap[-1]);
exit(1);
}
}
/* Input was -wxxx|p or -wxxx |p */
max_line_width = estrtol(p, "-width", 1);
break;
case 'r': case 'R': /* -randomise */
while (*p && !IsDigit(*p)) p++;
if (!*p) {
/* The input is -r # as two arguments */
p = *++ap;
if (!p || !IsDigit(*p)) {
(void) fprintf(stderr,
"%s: no value for %s option,
program_name, ap[-1]);
exit(1);
}
}
/* Input was -rxxx|p or -rxxx |p */
random_seed = (int)estrtol(p, "-randomise", 0);
break;
default:
if (!IsDigit(*p)) {
(void) fprintf(stderr,
"%s: unknown option: %s,
program_name, ap[0]);
exit(1);
}
sample_size = estrtol(p, "-samplesize", 1);
break;
}
}
srandom(random_seed);
max_line_width += 2; /* to allow for "0" at the end */
}
main(argc, argv)
int argc;
char **argv;
{
int random_choice;
int n;
char **reservoir;
/* Analyse the arguments */
crack(argc, argv);
/* Try to allocate the line buffer */
input_line = malloc((unsigned)max_line_width);
if (!input_line) {
(void) fprintf(stderr,
"%s: unable to allocate %d bytes for line buffer,
program_name, max_line_width);
exit(1);
}
/* Try to allocate the reservoir */
reservoir = (char**) malloc((unsigned)sample_size * sizeof *reservoir);
if (!reservoir) {
(void) fprintf(stderr,
"%s: unable to allocate space for %d pointers,
program_name, sample_size);
exit(1);
}
/* Now fill the reservoir */
for (records_read_so_far = 0;
/*while*/ records_read_so_far < sample_size;
/*doing*/ records_read_so_far++) {
if (!read_input_line()) {
(void) fprintf(stderr,
"%s: input has only %d lines: %s %s,
program_name, records_read_so_far,
base_part(program_name), argv[1]);
exit(1);
}
reservoir[records_read_so_far] = copy_input_line();
}
/* read the rest of the input */
/* replacing reservoir entries at random */
while (read_input_line()) {
records_read_so_far++;
random_choice = Random(records_read_so_far);
if (random_choice < sample_size) {
(void)free(reservoir[random_choice]);
reservoir[random_choice] = copy_input_line();
}
}
/* Finally, copy the reservoir to the standard output */
/* We scramble it as we go */
for (n = 0; n < sample_size; n++) {
random_choice = Random(sample_size-n) + n;
if (fputs(reservoir[random_choice], stdout) < 0)
OutputError();
reservoir[random_choice] = reservoir[n];
}
exit(0);
}
------ EOF ------ ls -l sample.c