Usenet 1994 October

home *** CD-ROM | disk | FTP | other *** search

/ Usenet 1994 October / usenetsourcesnewsgroupsinfomagicoctober1994disk2.iso / unix / volume27 / mthreads / part03 / mt-process.c

Wrap

C/C++ Source or Header | 1993-11-20 | 44KB | 1,829 lines

/* $Id: mt-process.c,v 3.0 1993/10/01 00:14:03 davison Trn $
*/
/* The authors make no claims as to the fitness or correctness of this software
 * for any use whatsoever, and it is provided as is. Any use of this software
 * is at the user's own risk.
 */

#include "EXTERN.h"
#include "common.h"
#include "thread.h"
#include "mthreads.h"
#include "ndir.h"
#include "nntpclient.h"

/* Per-article scratch state.  process_articles() clears these before it
** parses each header; the header parsers fill them in and insert_article()
** and the subject/author allocators consume them.
*/
char references[1024];		/* text of the References header */
char subject_str[80];		/* subject as cleaned by get_subject_str() */
bool found_Re;			/* set when get_subject_str() strips a "Re:" */
char author_str[20];		/* author as compressed by get_author_str() */

/* Article range handed from process_articles() to expire(). */
ART_NUM absfirst;
ART_NUM lastart;
char *ctlarea;			/* bitmap allocated and freed by expire() */

extern int log_verbosity, slow_down;

long num;			/* article number used in log messages and
				** stored by insert_article() */
DOMAIN *next_domain;		/* next domain to visit in expire()'s walk */

void insert_article(), expire(), trim_roots(), order_roots(), trim_authors();
void make_root(), use_root(), merge_roots(), set_root(), unlink_root();
void link_child(), unlink_child();
void free_article(), free_domain(), free_subject(), free_root(), free_author();
void get_subject_str(), get_author_str();
int valid_message_id _((char *, char *));
int subject_equal _((char *, char *));
ARTICLE *get_article();
SUBJECT *new_subject();
AUTHOR *new_author();

#ifndef USE_NNTP
/* The article file currently open; insert_article() reads further lines
** from it when hunting for a cited message-id.
*/
static FILE *fp_article;
#endif

/* Given the upper/lower bounds of the articles in the current group, add all
** the ones that we don't know about and remove all the ones that have expired.
** The current directory must be the newsgroup's spool directory.
*/ void process_articles(first_article, last_article) ART_NUM first_article, last_article; { register char *cp, *str; register ARTICLE *article; register ART_NUM i; time_t date; bool has_xrefs; int len; #ifdef USE_NNTP bool orig_extra = extra_expire; #else extern int sys_nerr; extern char *sys_errlist[]; #endif int start = total.last + 1; if (first_article > start) { start = first_article; } added_count = last_article - start + 1; if (added_count < 0) { added_count = 0; } else if (added_count > 1000) { /* Don't overwork ourselves the first time */ added_count = 1000; start = last_article - 1000 + 1; } expired_count = 0; for (i = start; i <= last_article; i++) { #ifdef USE_NNTP if (slow_down) { usleep(slow_down); } sprintf(ser_line, "HEAD %ld", (long)i); nntp_command(ser_line); if (nntp_check(FALSE) == NNTP_CLASS_FATAL) { last_article = i - 1; extra_expire = FALSE; break; } if (*ser_line != NNTP_CLASS_OK) { added_count--; continue; } #else /* Open article in current directory. */ sprintf(buf, "%ld", (long)i); /* Set errno for purely paranoid reasons */ errno = 0; if ((fp_article = fopen(buf, "r")) == Nullfp) { /* Missing files are ok -- they've just been expired or canceled */ if (errno != 0 && errno != ENOENT) { if (errno < 0 || errno > sys_nerr) { log_error("Can't open `%s': Error %d.\n", buf, errno); } else { log_error("Can't open `%s': %s.\n", buf, sys_errlist[errno]); } } added_count--; continue; } #endif article = Nullart; *references = '\0'; *author_str = '\0'; *subject_str = '\0'; found_Re = 0; date = 0; has_xrefs = FALSE; #ifdef USE_NNTP while (nntp_gets(cp = buf, sizeof buf) == 0) { process_line: if (*cp == '.') { if (cp[1]) { log_error("Header line starts with '.'! 
[%ld].\n", (long)i); continue; } break; } #else while ((cp = fgets(buf, sizeof buf, fp_article)) != Nullch) { process_line: if (*cp == '\n') { /* check for end of header */ break; /* break out when found */ } #endif if ((unsigned char)*cp <= ' ') { /* skip continuation lines */ continue; /* (except references -- see below) */ } if ((str = index(cp, ':')) == Nullch) { #ifdef USE_NNTP if (log_verbosity) { log_error("Header line missing colon! [%ld].\n", (long)i); } continue; /* skip bogus header line */ #else break; /* end of header if no colon found */ #endif } if ((len = str - cp) > 10) { continue; /* skip keywords > 10 chars */ } #ifndef USE_NNTP cp[strlen(cp)-1] = '\0'; /* remove newline */ #endif while (cp < str) { /* lower-case the keyword */ if ((unsigned char)*cp <= ' ') { /* stop at any whitespace */ break; } if (isupper(*cp)) { *cp = tolower(*cp); } cp++; } *cp = '\0'; cp = buf; if (len == 4 && strEQ(cp, "date")) { date = parsedate(str + 1); } else if (len == 4 && strEQ(cp, "from")) { get_author_str(str + 1); } else if (len == 4 && strEQ(cp, "xref")) { has_xrefs = TRUE; } else if (len == 7 && strEQ(cp, "subject")) { get_subject_str(str + 1); } else if (len == 10 && strEQ(cp, "message-id")) { if (!article) { article = get_article(str + 1); } else { if (log_verbosity) { log_error("Found multiple Message-IDs! [%ld].\n", (long)i); } } } else if (len == 10 && strEQ(cp, "references")) { /* include preceding space in saved reference */ len = strlen(str + 1); bcopy(str + 1, references, len + 1); str = references + len; /* check for continuation lines */ #ifdef USE_NNTP while (nntp_gets(cp = buf, sizeof buf) == 0) { #else while ((cp = fgets(buf, sizeof buf, fp_article)) != Nullch) { #endif if (*cp != ' ' && *cp != '\t') { goto process_line; } while (*++cp == ' ' || *cp == '\t') { ; } *--cp = ' '; /* If the references are too long, shift them over to ** always save the most recent ones. 
*/ if ((len += strlen(cp)) > 1023) { strcpy(buf, buf + len - 1023); str -= len - 1023; len = 1023; } strcpy(str, cp); }/* while */ break; }/* if */ }/* while */ if (article) { num = i; insert_article(article, date); if (has_xrefs) { article->flags |= HAS_XREFS; } } else { if (log_verbosity) { log_error("Message-ID line missing! [%ld].\n", (long)i); } } #ifndef USE_NNTP fclose(fp_article); #endif } if (extra_expire || first_article > total.first) { absfirst = first_article; lastart = last_article; expire(first_article <= last_article ? extra_expire : FALSE); } trim_roots(); order_roots(); trim_authors(); total.first = first_article; total.last = last_article; #ifdef USE_NNTP extra_expire = orig_extra; #endif } /* Search all articles for numbers less than new_first. Traverse the list ** using the domain links so we don't have to deal with the tree structure. ** If extra is true, list all articles in the directory to setup a bitmap ** with the existing articles marked as 'read', and drop everything that ** isn't there. */ void expire(extra) bool_int extra; { register DOMAIN *domain; register ARTICLE *article, *next_art, *hold; register ART_NUM art; #ifdef USE_NNTP static int listgroup_type = CHECK_LISTGROUP; extern char line[]; /* line contains the group name */ #else register DIR *dirp; #endif if (extra) { MEM_SIZE ctlsize; /* Allocate a bitmap large enough for absfirst thru lastart. */ ctlsize = (MEM_SIZE)(OFFSET(lastart)/BITSPERBYTE+20); ctlarea = safemalloc(ctlsize); bzero(ctlarea, ctlsize); /* List all articles and use ctl_set() to keep track of what's there. */ #ifdef USE_NNTP try_again: switch (listgroup_type) { case GOOD_LISTGROUP: nntp_command("LISTGROUP"); (void)nntp_check(FALSE); break; case BAD_LISTGROUP: sprintf(ser_line, "LISTGROUP %s", line); nntp_command(ser_line); (void)nntp_check(FALSE); break; case CHECK_LISTGROUP: /* Check if LISTGROUP is available. 
*/ nntp_command("LISTGROUP"); if (nntp_check(FALSE) == NNTP_CLASS_OK) { listgroup_type = GOOD_LISTGROUP; } else if (atoi(ser_line) == NNTP_SYNTAX_VAL) { /* A command syntax error (not an unrecongnized command) is ** the LISTGROUP that takes a newsgroup name. */ listgroup_type = BAD_LISTGROUP; goto try_again; } else { listgroup_type = NO_LISTGROUP; goto try_again; } break; default: sprintf(ser_line,"XHDR lines %ld-%ld",(long)absfirst,(long)lastart); nntp_command(ser_line); (void)nntp_check(FALSE); } if (*ser_line == NNTP_CLASS_OK) { while (1) { if (nntp_gets(buf, sizeof buf) < 0) { extra = 0; break; } if (*buf == '.') { break; } art = atol(buf); if (art >= absfirst && art <= lastart) { ctl_set(art); } } } else { extra = 0; } #else if ((dirp = opendir(".")) != 0) { register struct direct *dp; while ((dp = readdir(dirp)) != Null(struct direct *)) { register char *p; for (p = dp->d_name; *p; p++) { if (!isdigit(*p)) { goto nope; } } art = atol(dp->d_name); if (art >= absfirst && art <= lastart) { ctl_set(art); } nope: ; } closedir(dirp); } else { extra = 0; } #endif } else { ctlarea = Nullch; } for (domain = &unk_domain; domain; domain = next_domain) { next_domain = domain->link; for (article = domain->ids; article; article = next_art) { next_art = article->id_link; if (!article->subject) { continue; } if (article->num < absfirst || (extra && !ctl_check(article->num))) { article->subject->count--; article->subject = 0; article->flags &= ~HAS_XREFS; article->author->count--; article->author = 0; /* Free expired article if it has no children. Then check ** if the parent(s) are also fake and can be freed. We'll ** free any empty roots later. 
*/ while (!article->children) { hold = article->parent; unlink_child(article); free_article(article); if (hold && !hold->subject) { if ((article = hold) == next_art) { next_art = next_art->id_link; } } else { break; } } expired_count++; }/* if */ }/* for */ }/* for */ next_domain = Null(DOMAIN*); safefree(&ctlarea); } /* Trim the article chains down so that we don't have more than one faked ** article between the root and any real ones. */ void trim_roots() { register ROOT *root, *last_root; register ARTICLE *article, *next; register SUBJECT *subject, *last_subj; register int found; #ifndef lint last_root = (ROOT *)&root_root; #else last_root = Null(ROOT*); #endif for (root = root_root; root; root = last_root->link) { for (article = root->articles; article; article = article->siblings) { /* If an article has no subject, it is a "fake" reference node. ** If all of its immediate children are also fakes, delete it ** and graduate the children to the root. If everyone is fake, ** the chain dies. */ while (!article->subject) { found = 0; for (next = article->children; next; next = next->siblings) { if (next->subject) { found = 1; break; } } if (!found) { /* Remove this faked article and move all its children ** up to the root. */ next = article->children; unlink_child(article); free_article(article); for (article = next; article; article = next) { next = article->siblings; article->parent = Nullart; link_child(article); } article = root->articles; /* start this root over */ } else { break; /* else, on to next article */ } } } /* Free all unused subject strings. Begin by trying to find a ** subject for the root's pointer. */ for (subject = root->subjects; subject && !subject->count; subject = root->subjects) { root->subjects = subject->link; free_subject(subject); root->subject_cnt--; } /* Then free up any unused intermediate subjects. 
*/ if ((last_subj = subject) != Null(SUBJECT*)) { while ((subject = subject->link) != Null(SUBJECT*)) { if (!subject->count) { last_subj->link = subject->link; free_subject(subject); root->subject_cnt--; subject = last_subj; } else { last_subj = subject; } } } /* Now, free all roots without articles. Flag unexpeced errors. */ if (!root->articles) { if (root->subjects) { log_error("** Empty root still had subjects remaining! **\n"); } last_root->link = root->link; free_root(root); } else { last_root = root; } } } /* Descend the author list, find any author names that aren't used ** anymore and free them. */ void trim_authors() { register AUTHOR *author, *last_author; #ifndef lint last_author = (AUTHOR *)&author_root; #else last_author = Null(AUTHOR*); #endif for (author = author_root; author; author = last_author->link) { if (!author->count) { last_author->link = author->link; free_author(author); } else { last_author = author; } } } /* Reorder the roots to place the oldest ones first (age determined by ** date of oldest article). */ void order_roots() { register ROOT *root, *next, *search, *link; /* If we don't have at least two roots, we're done! */ if (!(root = root_root) || !(next = root->link)) { return; /* RETURN */ } /* Break the old list off after the first root, and then start ** inserting the roots into the list by date. */ root->link = Null(ROOT*); while ((root = next) != Null(ROOT*)) { next = next->link; if ((search = root_root)->articles->date >= root->articles->date) { root->link = root_root; root_root = root; } else { register time_t radate = root->articles->date; while ((link = search->link) != NULL && link->articles->date < radate) { search = link; } root->link = link; search->link = root; } } } #define EQ(x,y) ((isupper(x) ? tolower(x) : (x)) == (y)) /* Parse the subject into 72 characters or less. Remove any "Re[:^]"s from ** the front (noting that it's there), and any "(was: old)" stuff from ** the end. 
Then, compact multiple whitespace characters into one space, ** trimming leading/trailing whitespace. If it's still too long, unmercifully ** cut it off. We don't bother with subject continuation lines either. */ void get_subject_str(str) register char *str; { register char *cp; register int len; while (*str && (unsigned char)*str <= ' ') { str++; } if (!*str) { bcopy("<None>", subject_str, 7); return; /* RETURN */ } cp = str; while (EQ(cp[0], 'r') && EQ(cp[1], 'e')) { /* check for Re: */ cp += 2; if (*cp == '^') { /* allow Re^2: */ while (*++cp <= '9' && *cp >= '0') { ; } } if (*cp != ':') { break; } while (*++cp == ' ') { ; } found_Re = 1; str = cp; } /* Remove "(was: oldsubject)", because we already know the old subjects. ** Also match "(Re: oldsubject)". Allow possible spaces after the ('s. */ for (cp = str; (cp = index(cp+1, '(')) != Nullch;) { while (*++cp == ' ') { ; } if (EQ(cp[0], 'w') && EQ(cp[1], 'a') && EQ(cp[2], 's') && (cp[3] == ':' || cp[3] == ' ')) { *--cp = '\0'; break; } if (EQ(cp[0], 'r') && EQ(cp[1], 'e') && ((cp[2]==':' && cp[3]==' ') || (cp[2]=='^' && cp[4]==':'))) { *--cp = '\0'; break; } } /* Copy subject to a temporary string, compacting multiple spaces/tabs */ for (len = 0, cp = subject_str; len < 72 && *str; len++) { if ((unsigned char)*str <= ' ') { while (*++str && (unsigned char)*str <= ' ') { ; } *cp++ = ' '; } else { *cp++ = *str++; } } if (cp[-1] == ' ') { cp--; } *cp = '\0'; } #ifndef OLD_AUTHOR_CODE /* Name-munging routines written by Ross Ridge. Public Domain. ** Enhanced by Wayne Davison. */ /* If necessary, compress a net user's full name by playing games with ** initials and the middle name(s). If we start with "Ross Douglas Ridge" ** we try "Ross D Ridge", "Ross Ridge", "R D Ridge" and finally "R Ridge" ** before simply truncating the thing. We also turn "R. Douglas Ridge" ** into "Douglas Ridge" and "Ross Ridge D.D.S." into "Ross Ridge" as a ** first step of the compaction, if needed. 
*/ static char * compress_name(name, max) char *name; int max; { register char *s, *last, *mid, *d; register int len, namelen, midlen; int notlast; /* First remove white space from both ends. */ while (isspace(*name)) { name++; } if ((len = strlen(name)) == 0) { return name; } s = name + len - 1; while (isspace(*s)) { s--; } s[1] = '\0'; if (s - name + 1 <= max) { return name; } /* Look for characters that likely mean the end of the name ** and the start of some hopefully uninteresting additional info. ** Spliting at a comma is somewhat questionalble, but since ** "Ross Ridge, The Great HTMU" comes up much more often than ** "Ridge, Ross" and since "R HTMU" is worse than "Ridge" we do ** it anyways. */ for (d = name + 1; *d; d++) { if (*d == ',' || *d == ';' || *d == '(' || *d == '@' || (*d == '-' && (d[1] == '-' || d[1] == ' '))) { *d-- = '\0'; s = d; break; } } /* Find the last name */ do { notlast = 0; while (isspace(*s)) { s--; } s[1] = '\0'; len = s - name + 1; if (len <= max) { return name; } /* If the last name is an abbreviation it's not the one we want. */ if (*s == '.') notlast = 1; while (!isspace(*s)) { if (s == name) { /* only one name */ name[max] = '\0'; return name; } if (isdigit(*s)) { /* probably a phone number */ notlast = 1; /* so chuck it */ } s--; } } while (notlast); last = s-- + 1; /* Look for a middle name */ while (isspace(*s)) { /* get rid of any extra space */ len--; s--; } mid = name; while (!isspace(*mid)) { mid++; } namelen = mid - name + 1; if (mid == s+1) { /* no middle name */ mid = 0; midlen = 0; } else { *mid++ = '\0'; while (isspace(*mid)) { len--; mid++; } midlen = s - mid + 2; /* If first name is an initial and middle isn't and it all fits ** without the first initial, drop it. */ if (len > max && mid != s && mid[1] != '.' && (!name[1] || (name[1] == '.' 
&& !name[2])) && len - namelen <= max) { len -= namelen; name = mid; mid = 0; } } s[1] = '\0'; if (mid && len > max) { /* Turn middle names into intials */ len -= s - mid + 2; d = s = mid; while (*s) { if (isalpha(*s)) { if (d != mid) { *d++ = ' '; } *d++ = *s++; } while (*s && !isspace(*s)) { s++; } while (isspace(*s)) { s++; } } if (d != mid) { *d = '\0'; midlen = d - mid + 1; len += midlen; } else { mid = 0; } } if (len > max) { /* If the first name fits without the middle initials, drop them */ if (mid && len - midlen <= max) { len -= midlen; mid = 0; } else { /* Turn the first name into an initial */ len -= namelen - 2; name[1] = '\0'; namelen = 2; if (len > max) { /* Dump the middle initials (if present) */ if (mid) { len -= midlen; mid = 0; } if (len > max) { /* Finally just truncate the last name */ last[max - 2] = '\0'; } } } } /* Paste the names back together */ d = name + namelen; if (mid) { d[-1] = ' '; strcpy(d, mid); d += midlen; } d[-1] = ' '; strcpy(d, last); return name; } /* Compress an email address, trying to keep as much of the local part of ** the addresses as possible. The order of precence is @ ! %, but ** @ % ! may be better... */ static char * compress_address(name, max) char *name; int max; { char *s, *at, *bang, *hack, *start; int len; /* Remove white space from both ends. */ while (isspace(*name)) { name++; } if ((len = strlen(name)) == 0) { return name; } s = name + len - 1; while (isspace(*s)) { s--; } s[1] = '\0'; if (*name == '<') { name++; if (*s == '>') { *s-- = '\0'; } } if ((len = s - name + 1) <= max) { return name; } at = bang = hack = NULL; for (s = name + 1; *s; s++) { /* If there's whitespace in the middle then it's probably not ** really an email address. 
*/ if (isspace(*s)) { name[max] = '\0'; return name; } switch (*s) { case '@': if (at == NULL) { at = s; } break; case '!': if (at == NULL) { bang = s; hack = NULL; } break; case '%': if (at == NULL && hack == NULL) { hack = s; } break; } } if (at == NULL) { at = name + len; } if (hack != NULL) { if (bang != NULL) { if (at - bang - 1 >= max) { start = bang + 1; } else if (at - name >= max) { start = at - max; } else { start = name; } } else { start = name; } } else if (bang != NULL) { if (at - name >= max) { start = at - max; } else { start = name; } } else { start = name; } if (len - (start - name) > max) { start[max] = '\0'; } return start; } /* Extract the full-name part of an email address, returning NULL if not ** found. */ static char * extract_name(name) char *name; { char *s; char *lparen, *rparen; char *langle; while (isspace(*name)) { name++; } lparen = index(name, '('); rparen = rindex(name, ')'); langle = index(name, '<'); if (!lparen && !langle) { return NULL; } else if (langle && (!lparen || !rparen || lparen > langle || rparen < langle)) { if (langle == name) { return NULL; } *langle = '\0'; } else { name = lparen; *name++ = '\0'; while (isspace(*name)) { name++; } if (name == rparen) { return NULL; } if (rparen != NULL) { *rparen = '\0'; } } if (*name == '"') { name++; while (isspace(*name)) { name++; } if ((s = rindex(name, '"')) != NULL) { *s = '\0'; } } return name; } /* Try to fit the author name in 16 bytes. Use the comment portion if ** present. */ void get_author_str(addr) char *addr; { char *s; /* TODO: Do we need to eliminate ctrl chars here? */ if ((s = extract_name(addr)) != NULL) { s = compress_name(s, 16); } else { s = compress_address(addr, 16); } strcpy(author_str, s); } #else /* Here's the old, simple method in case someone wants it. */ /* Try to fit the author name in 16 bytes. Use the comment portion in ** parenthesis if present. Cut off non-commented names at the '@' or '%'. 
** Then, put as many characters as we can into the 16 bytes, packing multiple
** whitespace characters into a single space.
**
** The result is left in author_str; str is modified in place.
*/
void
get_author_str(str)
char *str;
{
    register char *cp, *cp2;

    if ((cp = index(str, '(')) != Nullch) {
        str = cp+1;
        if ((cp = rindex(str, ')')) != Nullch) {
            *cp = '\0';
        }
    } else {
        if ((cp = index(str, '@')) != Nullch) {
            *cp = '\0';
        }
        if ((cp = index(str, '%')) != Nullch) {
            *cp = '\0';
        }
    }
    for (cp = str, cp2 = author_str; *cp && cp2-author_str < 16;) {
        /* Pack white space and turn ctrl-chars into spaces. */
        /* NOTE(review): where plain char is signed, 8-bit characters also
        ** compare <= ' ' here -- confirm whether that is intended.
        */
        if (*cp <= ' ') {
            while (*++cp && *cp <= ' ') {
                ;
            }
            /* No space is emitted in front of the name. */
            if (cp2 != author_str) {
                *cp2++ = ' ';
            }
        } else {
            *cp2++ = *cp++;
        }
    }
    *cp2 = '\0';
}

#endif

/* Take a message-id and see if we already know about it.  If so, return it.
** If not, create it.  We separate the id into its id@domain parts, and
** link all the unique ids to one copy of the domain portion.  This saves
** a bit of space.
**
** msg_id is modified in place.  Returns Nullart if the id is empty.
*/
ARTICLE *
get_article(msg_id)
char *msg_id;
{
    register DOMAIN *domain;
    register ARTICLE *article;
    register char *cp, *after_at;

    /* Take message id, break it up into <id@domain>, and try to match it. */
    while (*msg_id == ' ') {
        msg_id++;
    }
    cp = msg_id + strlen(msg_id) - 1;   /* last character of the id */
    if (msg_id >= cp) {
        if (log_verbosity) {
            log_error("Message-ID is empty! [%ld]\n", num);
        }
        return Nullart;
    }
    /* A missing bracket is logged and the id is accepted without it. */
    if (*msg_id++ != '<') {
        if (log_verbosity) {
            log_error("Message-ID doesn't start with '<' [%ld]\n", num);
        }
        msg_id--;
    }
    if (*cp != '>') {
        if (log_verbosity) {
            log_error("Message-ID doesn't end with '>' [%ld]\n", num);
        }
        cp++;
    }
    *cp = '\0';
    if (msg_id == cp) {
        if (log_verbosity) {
            log_error("Message-ID is null! [%ld]\n", num);
        }
        return Nullart;
    }

    /* Ids without an '@' are all filed under unk_domain. */
    if ((after_at = index(msg_id, '@')) == Nullch) {
        domain = &unk_domain;
    } else {
        *after_at++ = '\0';
        for (cp = after_at; *cp; cp++) {
            if (isupper(*cp)) {
                *cp = tolower(*cp);     /* lower-case domain portion */
            }
        }
        *cp = '\0';
        /* Try to find domain name in database. */
        for (domain = unk_domain.link; domain; domain = domain->link) {
            if (strEQ(domain->name, after_at)) {
                break;
            }
        }
        if (!domain) {          /* if domain doesn't exist, create it */
            register int len = cp - after_at + 1;
            domain = (DOMAIN *)safemalloc(sizeof (DOMAIN));
            total.domain++;
            domain->name = safemalloc(len);
            total.string2 += len;
            bcopy(after_at, domain->name, len);
            domain->ids = Nullart;
            /* New domains go right after unk_domain at the list's head. */
            domain->link = unk_domain.link;
            unk_domain.link = domain;
        }
    }
    /* Try to find id in this domain. */
    for (article = domain->ids; article; article = article->id_link) {
        if (strEQ(article->id, msg_id)) {
            break;
        }
    }
    if (!article) {             /* If it doesn't exist, create an article */
        register int len = strlen(msg_id) + 1;
        article = (ARTICLE *)safemalloc(sizeof (ARTICLE));
        bzero(article, sizeof (ARTICLE));
        total.article++;
        article->num = 0;
        article->id = safemalloc(len);
        total.string2 += len;
        bcopy(msg_id, article->id, len);
        article->domain = domain;
        article->id_link = domain->ids;
        domain->ids = article;
    }
    return article;
}

/* Take all the data we've accumulated about the article and shove it into
** the article tree at the best place we can possibly imagine.
**
** Uses the globals num, references, found_Re, subject_str and author_str
** for the article being inserted; the references text is consumed.
*/
void
insert_article(article, date)
ARTICLE *article;
time_t date;
{
    register ARTICLE *node, *last;
    register char *cp, *end;
#ifndef USE_NNTP
    int len;
#endif

    /* A subject marks an article whose real contents are already known. */
    if (article->subject) {
        if (log_verbosity) {
            log_error("We've already seen article #%ld (%s@%s)\n", num, article->id, article->domain->name);
        }
        return;                                 /* RETURN */
    }
    article->date = date;
    article->num = num;
    article->flags = 0;

    if (!*references && found_Re) {
        if (log_verbosity > 1) {
            log_error("Missing reference line! [%ld]\n", num);
        }
    }
    /* If the article has a non-zero root, it is already in a thread somewhere.
    ** Unlink it to try to put it in the best possible spot.
    */
    if (article->root) {
        /* Check for a real or shared-fake parent.  Articles that have never
        ** existed have a num of 0.  Expired articles that remain as references
        ** have a valid num.  (Valid date too, but no subject.)
        */
        for (node = article->parent;
             node && !node->num && node->child_cnt == 1;
             node = node->parent) {
            ;
        }
        unlink_child(article);
        if (node) {                     /* do we have decent parents? */
            /* Yes: assume that our references are ok, and just reorder us
            ** with our siblings by date.
            */
            link_child(article);
            use_root(article, article->root);
            /* Freshen the date in any faked parent articles. */
            for (node = article->parent;
                 node && !node->num && date < node->date;
                 node = node->parent) {
                node->date = date;
                unlink_child(node);
                link_child(node);
            }
            return;                             /* RETURN */
        }
        /* We'll assume that this article has as good or better references
        ** than the child that faked us initially.  Free the fake reference-
        ** chain and process our references as usual.
        */
        for (node = article->parent; node; node = last) {
            unlink_child(node);
            last = node->parent;
            free_article(node);
        }
        article->parent = Nullart;              /* neaten up */
        article->siblings = Nullart;
    }
  check_references:
    if (!*references) { /* If no references but "Re:" in subject, */
        if (found_Re) { /* search for a reference in any cited text */
#ifndef USE_NNTP
            /* Look at up to four more lines of the open article. */
            for (len = 4; len && fgets(buf, sizeof buf, fp_article); len--) {
                if ((cp = index(buf, '<')) && (end = index(cp, ' '))) {
                    if (end[-1] == ',') {
                        end--;
                    }
                    *end = '\0';
                    if ((end = index(cp, '>')) == Nullch) {
                        end = cp + strlen(cp) - 1;
                    }
                    if (valid_message_id(cp, end)) {
                        strcpy(references+1, cp);
                        *references = ' ';
                        if (log_verbosity > 2) {
                            log_error("Found cited-text reference: '%s' [%ld]\n", references+1, num);
                        }
                        break;
                    }
                }
            }
#endif
        } else {
            article->flags |= ROOT_ARTICLE;
        }
    }
    /* If we have references, process them from the right end one at a time
    ** until we either run into somebody, or we run out of references.
    */
    if (*references) {
        last = article;
        node = Nullart;
        end = references + strlen(references) - 1;
        while ((cp = rindex(references, '<')) != Nullch) {
            while (end >= cp && ((unsigned char)*end <= ' ' || *end == ',')) {
                end--;
            }
            end[1] = '\0';
            /* Quit parsing references if this one is garbage. */
            if (!valid_message_id(cp, end)) {
                if (log_verbosity) {
                    log_error("Bad ref '%s' [%ld]\n", cp, num);
                }
                break;
            }
            /* Dump all domains that end in '.', such as "..." & "1@DEL." */
            if (end[-1] == '.') {
                break;
            }
            node = get_article(cp);
            *cp = '\0';         /* chop this reference off the string */

            /* Check for duplicates on the reference line.  Brand-new data has
            ** no date.  Data we just allocated earlier on this line has a
            ** date but no root.  Special-case the article itself, since it
            ** MIGHT have a root.
            */
            if ((node->date && !node->root) || node == article) {
                if (log_verbosity) {
                    log_error("Reference line contains duplicates [%ld]\n", num);
                }
                if ((node = last) == article) {
                    node = Nullart;
                }
                /* NOTE(review): end is not moved back to cp-1 on this path,
                ** so the next pass starts with end beyond the new
                ** terminator -- confirm this is harmless.
                */
                continue;
            }
            last->parent = node;
            link_child(last);
            /* An already-threaded article ends the search. */
            if (node->root) {
                break;
            }
            node->date = date;
            last = node;
            end = cp-1;
        }
        /* No usable reference at all: retry as an unreferenced article. */
        if (!node) {
            *references = '\0';
            goto check_references;
        }
        /* Check if we ran into anybody that was already linked.  If so, we
        ** just use their root.
        */
        if (node->root) {
            /* See if this article spans the gap between what we thought
            ** were two different roots.
            */
            if (article->root && article->root != node->root) {
                merge_roots(node->root, article->root);
                /* Set the roots of any children we brought with us. */
                set_root(article, node->root);
            }
            use_root(article, node->root);
        } else {
            /* We didn't find anybody we knew, so either create a new root or
            ** use the article's root if it was previously faked.
            */
            if (!article->root) {
                make_root(node);
                use_root(article, node->root);
            } else {
                node->root = article->root;
                link_child(node);
                use_root(article, article->root);
            }
        }
        /* Set the roots of the faked articles we created as references. */
        for (node = article->parent;
             node && !node->root;
             node = node->parent) {
            node->root = article->root;
        }
        /* Make sure we didn't circularly link to a child article(!), by
        ** ensuring that we run into the root before we run into ourself.
        */
        while (node && node->parent != article) {
            node = node->parent;
        }
        if (node) {
            /* Ugh.  Someone's tweaked reference line with an incorrect
            ** article-order arrived first, and one of our children is
            ** really one of our ancestors.  Cut off the bogus child branch
            ** right where we are and link it to the root.
            */
            if (log_verbosity) {
                log_error("Found ancestral child -- fixing.\n");
            }
            unlink_child(node);
            node->parent = Nullart;
            link_child(node);
        }
    } else {
        /* The article has no references.  Either turn it into a new root, or
        ** re-attach fleshed-out (previously faked) article to its old root.
        */
        if (!article->root) {
            make_root(article);
        } else {
            link_child(article);
            use_root(article, article->root);
        }
    }
}

/* Check if the string we've found looks like a valid message-id reference.
**
** start and end point at the first and last characters of the candidate.
** Returns 1 for something of the form "<...@...>", else 0.  The text may
** be patched in place: a known citation character at the end is turned
** back into '>', and the second '>' of a ">>" ending is cut off.
*/
int
valid_message_id(start, end)
register char *start, *end;
{
    char *mid;

    if (start == end) {
        return 0;
    }

    if (*end != '>') {
        /* Compensate for space cadets who include the header in their
        ** substitution of all '>'s into another citation character.
        */
        if (*end == '<' || *end == '-' || *end == '!' || *end == '%'
         || *end == ')' || *end == '|' || *end == ':' || *end == '}'
         || *end == '*' || *end == '+' || *end == '#' || *end == ']'
         || *end == '@' || *end == '$') {
            if (log_verbosity) {
                log_error("Reference ended in '%c' [%ld]\n", *end, num);
            }
            *end = '>';
        }
    } else if (end[-1] == '>') {
        if (log_verbosity) {
            log_error("Reference ended in '>>' [%ld]\n", num);
        }
        *(end--) = '\0';
    }
    /* Id must be "<...@...>" */
    if (*start != '<' || *end != '>' || (mid = index(start, '@')) == Nullch
     || mid == start+1 || mid+1 == end) {
        return 0;                               /* RETURN */
    }
    return 1;
}

/* Remove an article from its parent/siblings.  Leave parent pointer intact.
*/ void unlink_child(child) register ARTICLE *child; { register ARTICLE *last; if (!(last = child->parent)) { child->root->thread_cnt--; if ((last = child->root->articles) == child) { child->root->articles = child->siblings; } else { goto sibling_search; } } else { last->child_cnt--; if (last->children == child) { last->children = child->siblings; } else { last = last->children; sibling_search: while (last->siblings != child) { last = last->siblings; } last->siblings = child->siblings; } } } /* Link an article to its parent article. If its parent pointer is zero, ** link it to its root. Sorts siblings by date. */ void link_child(child) register ARTICLE *child; { register ARTICLE *node; register ROOT *root; if (!(node = child->parent)) { root = child->root; root->thread_cnt++; node = root->articles; if (!node || child->date < node->date) { child->siblings = node; root->articles = child; } else { goto sibling_search; } } else { node->child_cnt++; node = node->children; if (!node || child->date < node->date) { child->siblings = node; child->parent->children = child; } else { sibling_search: for (; node->siblings; node = node->siblings) { if (node->siblings->date > child->date) { break; } } child->siblings = node->siblings; node->siblings = child; } } } /* Create a new root for the specified article. If the current subject_str ** matches any pre-existing root's subjects, we'll instead add it on as a ** parallel thread. */ void make_root(article) register ARTICLE *article; { register ROOT *new, *node; register SUBJECT *subject; #ifndef NO_SUBJECT_MATCHING /* First, check the other root's subjects for a match. */ for (node = root_root; node; node = node->link) { for (subject = node->subjects; subject; subject = subject->link) { if (subject_equal(subject->str, subject_str)) { use_root(article, node); /* use it instead */ link_child(article); return; /* RETURN */ } } } #endif /* Create a new root. 
*/ new = (ROOT *)safemalloc(sizeof (ROOT)); total.root++; new->articles = article; new->root_num = article->num; new->thread_cnt = 1; if (article->num) { article->author = new_author(); new->subject_cnt = 1; new->subjects = article->subject = new_subject(); } else { new->subject_cnt = 0; new->subjects = Null(SUBJECT*); } article->root = new; new->link = root_root; root_root = new; } /* Add this article's subject onto the indicated root's list. Point the ** article at the root. */ void use_root(article, root) ARTICLE *article; ROOT *root; { register SUBJECT *subject; register ROOT *root2; SUBJECT *hold, *child_subj = Null(SUBJECT*), *sib_subj = Null(SUBJECT*); ARTICLE *node; article->root = root; /* If it's a fake, there's no subject to add. */ if (!article->num) { return; /* RETURN */ } /* If we haven't picked a unique message number to represent this root, ** use the first non-zero number we encounter. Which one doesn't matter. */ if (!root->root_num) { root->root_num = article->num; } article->author = new_author(); /* Check if the new subject matches any of the other subjects in this root. ** If so, we just update the count. If not, check all the other roots for ** a match. If found, the new subject is common between the two roots, so ** we merge the two roots together. */ root2 = root; #ifndef NO_SUBJECT_MATCHING do { #endif for (subject = root2->subjects; subject; subject = subject->link) { if (subject_equal(subject->str, subject_str)) { article->subject = subject; subject->count++; #ifndef NO_SUBJECT_MATCHING if (root2 != root) { merge_roots(root, root2); } #endif return; /* RETURN */ } } #ifndef NO_SUBJECT_MATCHING if ((root2 = root2->link) == Null(ROOT*)) { root2 = root_root; } } while (root2 != root); #endif article->subject = hold = new_subject(); root->subject_cnt++; /* Find the subject of any pre-existing children or siblings. We want ** to insert the new subject before one of these to keep the numbering ** intuitive in the newsreader. 
Never insert prior to our parent's ** subject, however. */ for (node = article->children; node; node = node->children) { if (node->subject) { child_subj = node->subject; break; } } for (node = article->siblings; node; node = node->siblings) { if (node->subject) { sib_subj = node->subject; break; } } if (article->parent) { if (article->parent->subject == child_subj) { child_subj = Null(SUBJECT*); } if (article->parent->subject == sib_subj) { sib_subj = Null(SUBJECT*); } } if (!(subject = root->subjects) || subject == child_subj || subject == sib_subj) { hold->link = root->subjects; root->subjects = hold; } else { while (subject->link && subject->link != child_subj && subject->link != sib_subj) { subject = subject->link; } hold->link = subject->link; subject->link = hold; } } /* Check subjects in a case-insignificant, punctuation-ignoring manner. */ int subject_equal(str1, str2) register char *str1, *str2; { register char ch1, ch2; while ((ch1 = *str1++)) { if (ch1 == ' ' || ispunct(ch1)) { while (*str1 && (*str1 == ' ' || ispunct(*str1))) { str1++; } ch1 = ' '; } else if (isupper(ch1)) { ch1 = tolower(ch1); } if (!(ch2 = *str2++)) { return 0; } if (ch2 == ' ' || ispunct(ch2)) { while (*str2 && (*str2 == ' ' || ispunct(*str2))) { str2++; } ch2 = ' '; } else if (isupper(ch2)) { ch2 = tolower(ch2); } if (ch1 != ch2) { return 0; } } if (*str2) { return 0; } return 1; } /* Create a new subject structure. */ SUBJECT * new_subject() { register int len = strlen(subject_str) + 1; register SUBJECT *subject; subject = (SUBJECT *)safemalloc(sizeof (SUBJECT)); total.subject++; subject->count = 1; subject->link = Null(SUBJECT*); subject->str = safemalloc(len); total.string1 += len; bcopy(subject_str, subject->str, len); return subject; } /* Create a new author structure. 
*/ AUTHOR * new_author() { register len = strlen(author_str) + 1; register AUTHOR *author, *last_author; last_author = Null(AUTHOR*); for (author = author_root; author; author = author->link) { #ifndef DONT_COMPARE_AUTHORS /* might like to define this to save time */ if (strEQ(author->name, author_str)) { author->count++; return author; /* RETURN */ } #endif last_author = author; } author = (AUTHOR *)safemalloc(sizeof (AUTHOR)); total.author++; author->count = 1; author->link = Null(AUTHOR*); author->name = safemalloc(len); total.string1 += len; bcopy(author_str, author->name, len); if (last_author) { last_author->link = author; } else { author_root = author; } return author; } /* Insert all of root2 into root1, setting the proper root values and ** updating subject counts. */ void merge_roots(root1, root2) ROOT *root1, *root2; { register ARTICLE *node, *next; register SUBJECT *subject; /* Remember whoever's root num is lower. This could screw up a ** newsreader's kill-thread code if someone already saw the roots as ** being separate, but it must be done. The newsreader code will have ** to handle this as best as it can. */ if (root1->root_num > root2->root_num) { root1->root_num = root2->root_num; } for (node = root2->articles; node; node = next) { /* For each article attached to root2: detach it, set the branch's ** root pointer to root1, and then attach it to root1. */ next = node->siblings; unlink_child(node); node->siblings = Nullart; set_root(node, root1); /* sets children too */ /* Link_child() depends on node->parent being null and node->root ** being set. */ link_child(node); } root1->subject_cnt += root2->subject_cnt; if (!(subject = root1->subjects)) { root1->subjects = root2->subjects; } else { while (subject->link) { subject = subject->link; } subject->link = root2->subjects; } unlink_root(root2); free_root(root2); } /* When merging roots, we need to reset all the root pointers. 
*/ void set_root(node, root) ARTICLE *node; ROOT *root; { while (node) { node->root = root; if (node->children) { set_root(node->children, root); } node = node->siblings; } } /* Unlink a root from its neighbors. */ void unlink_root(root) register ROOT *root; { register ROOT *node; if ((node = root_root) == root) { root_root = root->link; } else { while (node->link != root) { node = node->link; } node->link = root->link; } } /* Free an article and its message-id string. All other resources must ** already be free, and it must not be attached to any threads. */ void free_article(this) ARTICLE *this; { register ARTICLE *art; if ((art = this->domain->ids) == this) { if (!(this->domain->ids = this->id_link)) { free_domain(this->domain); } } else { while (this != art->id_link) { art = art->id_link; } art->id_link = this->id_link; } total.string2 -= strlen(this->id) + 1; free(this->id); free(this); total.article--; } /* Free the domain only when its last unique id has been freed. */ void free_domain(this) DOMAIN *this; { register DOMAIN *domain; if (this == (domain = &unk_domain)) { return; } if (this == next_domain) { /* help expire routine skip freed domains */ next_domain = next_domain->link; } while (this != domain->link) { domain = domain->link; } domain->link = this->link; total.string2 -= strlen(this->name) + 1; free(this->name); free(this); total.domain--; } /* Free the subject structure and its string. */ void free_subject(this) SUBJECT *this; { total.string1 -= strlen(this->str) + 1; free(this->str); free(this); total.subject--; } /* Free a root. It must already be unlinked. */ void free_root(this) ROOT *this; { free(this); total.root--; } /* Free the author structure when it's not needed any more. 
*/
void
free_author(this)
AUTHOR *this;
{
    /* Take the name's bytes (including its terminator) back out of the
    ** string tally before releasing the storage.
    */
    total.string1 -= strlen(this->name) + 1;
    free(this->name);
    free(this);
    total.author--;
}

#if defined(USE_NNTP) && !defined(HAS_USLEEP)
/* Fallback usleep() for NNTP builds on systems that lack one.  Pauses for
** roughly usec microseconds.  With USELECT defined the delay is done with
** a select() timeout; otherwise only whole seconds are slept via sleep(),
** so delays under one second do not pause at all.  A zero or negative
** delay returns immediately.  Always returns 0.
*/
int
usleep(usec)
long usec;
{
# ifndef USELECT
    /* Reject non-positive delays just as the select() version does; a
    ** negative second count handed to sleep() would otherwise turn into
    ** a huge unsigned value.
    */
    if (usec <= 0) {
        return 0;
    }
    if (usec /= 1000000) {
        sleep((int)usec);
    }
# else
    struct timeval t;

    if (usec <= 0) {
        return 0;
    }
    t.tv_usec = usec % 1000000;
    t.tv_sec  = usec / 1000000;
    (void) select(1, 0, 0, 0, &t);
# endif
    return 0;
}
#endif