C/C++ Interactive Guide

home *** CD-ROM | disk | FTP | other *** search

/ C/C++ Interactive Guide / c-cplusplus-interactive-guide.iso / c_ref / csource4 / 248_01 / matches.c < prev next >

Wrap

Text File | 1989-08-18 | 12KB | 487 lines

/* ** name: matches.c ** purpose: Words that almost match pattern searching ** source: Computer Language - November 1986 ** by Dave Taylor ** coded: 1988.11.07 - Roberto Artigas Jr ** NOTES: This set of routines comes from a number of ** sources, all I did is integrate them into a ** subroutine that uses all the methods for ** text matches. */ #define DEBUG 0 /* 0 = NO debug, 1 = YES debug */ #define TEST 0 /* 0 = library, 1 = mainline test */ #include <stdio.h> #include <stdlib.h> #include <ctype.h> #include <string.h> #ifdef tolower #undef tolower #endif #define tolower(c) (isupper(c) ? c - 'A' + 'a' : c) #ifdef min #undef min #endif #define min(a,b) ((a) < (b) ? (a) : (b)) /* ** Defined constants */ #define WORD_LENGTH 40 #define THRESHOLD 0.500 /* PF474 */ #define THRESHOLD2 0.750 /* GESTALT */ #define SOUNDEX_LENGTH 4 /* soundex */ #ifndef TRUE #define TRUE 1 #define FALSE 0 #endif /* ** For the soundex algorithm: 'x' means 'delete this letter' */ #define index_of(ch) (ch - 'A') #define IGNORE 'x' static char soundex_mappings[] = { "x123x12xx22455x12623x1x2x2" }; /* ** Routine prototypes */ char *remove_duplicate_letters(); char *translate_case(); char *soundex_translation_of(); char *reverse(); double pf474_algorithm(); /* ** name: reverse ** purpose: Reverse the word and return it */ char *reverse(word) char *word; { static char buffer[WORD_LENGTH]; register int i, j = 0; for (i=strlen(word)-1; i > -1; i--) buffer[j++] = word[i]; buffer[j] = '\0'; return((char *)buffer); } /* ** name: translate_case ** purpose: Destructively translate the word to all lowercase ** returning both the altered word and a pointer to ** the altered word */ char *translate_case(word) char *word; { register int index; for (index = 0; index < strlen(word); index++) word[index] = tolower(word[index]); return((char *)word); } /* ** name: remove_duplicate_letters ** purpose: This routine removes all multiply occurring ** letters in the given word and returns a ** 'sanitized' version of the word. For example, ** given 'missionary' it returns 'misionary'. */ char *remove_duplicate_letters(word) char *word; { register int index, bindex = 0; static char buffer[WORD_LENGTH] = { "" }; buffer[bindex++] = word[0]; for (index = 1; index < strlen(word); index++) if (word[index-1] != word[index]) buffer[bindex++] = word[index]; buffer[bindex] = 0; return((char *)buffer); } /* ** name: check_transpositions ** purpose: Transposes character pairs in word to see if ** any match the given patter. Returns TRUE if ** so, FALSE if not */ int check_transpositions(word,pattern) char *word; char *pattern; { register int index; char spare_character; for (index=1; index < strlen(word); index++) { spare_character = word[index]; word[index] = word[index-1]; word[index-1] = spare_character; if (strcmp(word,pattern) == 0) return(TRUE); word[index-1] = word[index]; word[index] = spare_character; } return(FALSE); } /* ** name: allow_a_missing_character ** purpose: This routine returns TRUE if word is the ** same as pattern, just missing a single letter. ** 1988.10.07 Roberto Artigas Jr - If word and pattern ** is a length of one and not an exact match exit ** saying that it is false */ int allow_a_missing_character(word, pattern) char *word; char *pattern; { register int index, windex = 0; if ((strlen(word)==1) && (strlen(pattern)==1) && (*word != *pattern)) return(FALSE); if (strlen(word) + 1 != strlen(pattern)) return(FALSE); for (index=0, windex=0; index < strlen(pattern); index++) if (word[windex] == pattern[index]) windex++; else if (windex < index) return(FALSE); if (word[windex] == pattern[index]) return(TRUE); else return(FALSE); } /* ** name: allow_an_incorrect_character ** purpose: This routine returns TRUE if word is the same ** as pattern, just with a single incorrect letter. ** 1988.10.07 Roberto Artigas Jr - If word and pattern ** is a length of one and not an exact match exit ** saying that it is false */ int allow_an_incorrect_character(word,pattern) char *word; char *pattern; { register int index, windex; register int already_seen_bad_letter = FALSE; if (strlen(word) != strlen(pattern)) return(FALSE); if ((strlen(word)==1) && (strlen(pattern)==1) && (*word != *pattern)) return(FALSE); for (index=0, windex=0; index < strlen(pattern); index++) { if (word[windex] == pattern[index]) windex++; else if (already_seen_bad_letter) return(FALSE); else { already_seen_bad_letter++; windex++; } } return(TRUE); } /* ** name: soundex_translation_of ** purpose: Implement soundex algorithm tranlation */ char *soundex_translation_of(buffer) char *buffer; { static char result[WORD_LENGTH]; register int i, j, ch, newch; result[0] = toupper(buffer[0]); result[1] = result[2] = result[3] = '0'; result[4] = '\0'; for (i=1, j=1; i < strlen(buffer) && j < SOUNDEX_LENGTH; i++) { ch = toupper(buffer[i]); if ((newch = soundex_mappings[index_of(ch)]) != IGNORE) if (result[j-1] != (char)newch) if (!(j == 1 && ch == result[0])) result[j++] = (char)newch; } return((char *)result); } /* ** name: soundex ** puspose: Implements matching based on the soundex ** algorithm - this simple part gets the two ** words 'Soundex'ized then returns a nonzero ** if the two match on this technique. */ int soundex(word,pattern) char *word; char *pattern; { char word2[WORD_LENGTH]; char pattern2[WORD_LENGTH]; strcpy(word2,soundex_translation_of(word)); strcpy(pattern2,soundex_translation_of(pattern2)); return((strcmp(word2,pattern2)==0)); } /* ** name: proximity_value ** purpose: This actually implements the algorithm used ** in the PF474 pattern matching chip from ** Proximity technologies, returning the number ** of matches encountered. */ int proximity_value(word,pattern) char *word; char *pattern; { int matches = 0, count = 1; int iteration, index, len; if (strcmp(word,pattern) == 0) { for (count=0; count <= strlen(word); count++) matches += (2*count); return(matches); } len = min(strlen(word),strlen(pattern)) + 1; count =1; do { for (iteration=1; iteration<len; iteration++) for (index=0; index<iteration; index++) if (tolower(word[index]) == tolower(pattern[index])) matches++; strcpy(word,reverse(word)); strcpy(pattern,reverse(pattern)); } while (count++ < 2); return(matches); } /* ** name: pf474_algorithm ** purpose: This is the top-level PF474 pattern matching ** routines. It implements the equation we are ** using */ double pf474_algorithm(word,pattern) char *word; char *pattern; { if (strlen(word) == 0 && strlen(pattern) == 0) return((double)1.0); return((double) (proximity_value(word,pattern) / (proximity_value(word,word)+proximity_value(pattern,pattern)))); } /* ** name: GCsubstr ** purpose: Recursive greatest common sub-string */ int GCsubstr(st1, end1, st2, end2) char *st1; char *end1; char *st2; char *end2; { register char *a1, *b1, *s1, *a2, *b2, *s2; short max, i; if (end1 <= st1) /* s1 empty */ return (0); if (end2 <= st2) /* s2 empty */ return (0); if (end1 == st1 + 1) /* s1 has one char */ if (end2 == st2 + 1) /* and s2 has one char */ return (0); /* They cannot be equal */ max = 0; b1 = end1; b2 = end2; for (a1 = st1; a1 < b1; a1++) for (a2 = st2; a2 < b2; a2++) if (*a1 == *a2) { /* How long common sub-string? */ for (i = 1; a1[i] && (a1[i] == a2[i]); i++) ; if (i > max) { max = i; s1 = a1; s2 = a2; b1 = end1 - max; b2 = end2 - max; } } if (! max) return (0); max += GCsubstr(s1 + max, end1, s2 + max, end2); /* RHS */ max += GCsubstr(st1, s1, st2, s2); /* LHS */ return(max); } /* ** name: simil ** purpose: Best guess matches ** author: Ratcliff