home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Simtel MSDOS 1992 December
/
simtel1292_SIMTEL_1292_Walnut_Creek.iso
/
msdos
/
txtutl
/
gestalt.arc
/
SIMIL_A.ASM
< prev
next >
Wrap
Assembly Source File
|
1988-12-06
|
13KB
|
339 lines
;From the article "Pattern Matching - The Gestalt Approach"
;by John W. Ratcliff and David E. Metzener
;Dr. Dobbs Journal, July 1988
; This version is for use in assembly language programs.
; See TESTSIM.ASM for a demonstration.
;
; - All required local variables are right here in this procedure.
; - Enter with DS:SI = pointer to string 1,
; DS:DI = pointer to string 2
; Preserves all registers except AX (which returns the similarity)
; It is CRITICAL that SS=CSeg! Else the bp referencing will not work!
; I have no intentions of hacking it for other configurations ..
; that's left as an exercise for the student.
; (Hint: Look in the Turbo Pascal version for an example.)
;
; David Kirschbaum
; Toad Hall
; kirsch@braggvax.ARPA
;TITLE SIMIL.ASM written by John W. Ratcliff and David E. Metzener
; November 10, 1987
; Uses the Ratcliff/Obershelp pattern recognition algorithm.
; The function SIMIL returns a percentage value corresponding to how
; alike any two strings are. Be certain to upper case the two strings
; passed if you are not concerned about case sensitivity.
_Simil_a proc near
;TH Data moved down here so it'll be completely contained in our procedure.
ststr1L dw 25 dup(?) ;contains lefts for string 1
ststr1R dw 25 dup(?) ;contains rights for string 1
ststr2L dw 25 dup(?) ;contains lefts for string 2
ststr2R dw 25 dup(?) ;contains rights for string 2
stcknum dw ? ;number of elements on the stack
score dw ? ;the # of chars in common times 2
total dw ? ;total # of chars in string 1 and 2
cL1 dw ? ;left of string 1 found in common
cR1 dw ? ;right of string 1 found in common
cL2 dw ? ;left of string 2 found in common
cR2 dw ? ;right of string 2 found in common
s2ed dw ? ;the end of string 2 used in compare
_simil:
; This routine expects pointers passed in two character strings, null
; terminated, that you wish compared. It returns a percentage value
; from 0 to 100% corresponding to how alike the two strings are.
; Usage: DS:SI = pointer to string 1 TH
; DS:DI = pointer to string 2 TH
; The similarity routine is composed of three major components
; pushst --- pushes a string section to be compared on the stack
; popst --- pops a string section to be examined off of the stack
; compare --- finds the largest group of characters in common between
; any two string sections
; The similarity routine begins by computing the total length of both
; strings passed and placing that value in TOTAL. It then takes
; the beginning and ending of both strings passed and pushes them on
; the stack. It then falls into the main line code.
; The original two strings are immediately popped off of the stack and
; are passed to the compare routine. The compare routine will find the
; largest group of characters in common between the two strings.
; The number of characters in common is multiplied times two and added
; to the total score. If there were no characters in common, then there
; is nothing to push onto the stack. If there are exactly one character
; to the left in both strings, then we needn't push it on the stack.
; (We already know they aren't equal from the previous call to compare.)
; Otherwise the characters to the left are pushed onto the stack. These
; same rules apply to characters to the right of the substring found in
; common. This process of pulling substrings off of the stack, comparing
; them, and pushing remaining sections on the stack is continued until
; the stack is empty. On return the total score is divided by the
; number of characters in both strings. This is multiplied times 100 to
; yield a percentage. This percentage similarity is returned to the
; calling procedure.
push bp ;save BP reg.
mov bp,sp ;save SP reg in BP for use in program
push bx ;save all regs we're gonna trash TH
push cx
push dx
push si
push di
push ES ;save the ES seg register
mov ax,DS ;copy DS segment reg to ES
mov ES,ax
xor ax,ax ;zero out AX for clearing of SCORE var
mov score,ax ;zero out SCORE
mov stcknum,ax ;initialize number of stack entries to 0
;TH DS:SI and DS:DI already point to string 1 and string 2 respectively.
;TH mov si,[bp+4] ;move beginning pointer of string 1 to SI
;TH mov di,[bp+6] ;move beginning pointer of string 2 to SI
cmp [si],al ;is it a null string?
je StrErr ;can't process null strings
cmp [di],al ;is it a null string?
jne DoCmp ;Neither is a null string, so process them
StrErr: jmp Donit ;exit routine
DoCmp: push di ;save DI because of SCAS opcode
push si ;save SI because of SCAS opcode
xor al,al ;clear out AL to search for end of string
cld ;set direction flag forward
mov cx,-1 ;make sure we repeat the correct # of times
repnz scasb ;scan for string delimiter in string 2
dec di ;point DI to '$00' byte of string 2
dec di ;point DI to last char of string 2
mov bp,di ;move DI to BP where it is supposed to be
pop di ;restore SI into DI for SCAS (string 1)
repnz scasb ;scan for string delimiter in string 1
not cx ;do one's complement for correct length of st
sub cx,2 ;subtract the two zero bytes at the end of st
mov total,cx ;store string 2's length
dec di ;point DI to '$00' byte of string 1
dec di ;point DI to last char of string 1
mov bx,di ;move DI to BX where it is supposed to be
pop di ;restore DI to what it should be
call PushSt ;push values for the first call to SIMILIARITY
Main: cmp stcknum,0 ;is there anything on the stack?
je Done ;no, then all done!
call PopSt ;get regs set up for a compare call
call Compare ;do compare for this substring set
or dx,dx ;if nothing in common then nothing to push TH
jz Main ;try another set TH
shl dx,1 ;*2 for add to score
add score,dx ;add into score
mov bp,stcknum ;get number of entry I want to look at
shl bp,1 ;get AX ready to access string stacks
mov si,[ststr1L+bp] ;move L1 into SI or L1
mov bx,cL1 ;move CL1 into BX or R1
mov di,[ststr2L+bp] ;move L2 into DI or L2
mov cx,cL2 ;move CL2 into CX temporarily
mov ax,[ststr1R+bp] ;get old R1 off of stack
mov cL1,ax ;place in CL1 temporarily
mov ax,[ststr2R+bp] ;get old R2 off of stack
mov cL2,ax ;save in CL2 temporarily
mov bp,cx ;place CL2 into BP
cmp bx,si ;compare CL1 to L1
je ChRght ;if zero, then nothing on left side string 1
cmp bp,di ;compare CL2 to L2
je ChRght ;if zero, then nothing on left side string 2
dec bx ;point to last part of left side string 1
dec bp ;point to last part of left side string 2
cmp bx,si ;only one char to examine?
jne PushIt ;no->we need to examine this
cmp bp,di ;only one char in both?
je ChRght ;nothing to look at if both only contain 1 char
PushIt: call PushSt ;push left side on stack
ChRght: mov si,cR1 ;move CR1 into SI or L1
mov bx,cL1 ;move R1 into BX or R1
mov di,cR2 ;move CR2 into DI or L2
mov bp,cL2 ;move R2 into BP or R2
cmp si,bx ;compare CR1 to R1
je Main ;If zero, then nothing on right side string 1
cmp di,bp ;compare CR2 to R2
je Main ;if zero, then nothing on right side string 2
inc si ;point to last part of right side string 1
inc di ;point to last part of right side string 2
cmp bx,si ;only one char to examine?
jne Push2 ;no-> examine it
cmp bp,di ;only 1 char to examine in both?
je Main ;yes-> get next string off of stack
Push2: call PushSt ;push right side on stack
jmp short Main ;do next level of compares
Done: mov ax,score ;get score into AX for MUL
or ax,ax ;anything? TH
jz Donit ;nope, forget this mess TH
mov cx,100 ;get 100 into CX for MUL
mul cx ;multiply by 100
mov cx,total ;get total chars for divide
div cx ;divide by total
Donit:
pop ES ;restore ES to entry value
pop di ;and all other regs also TH
pop si
pop dx
pop cx
pop bx
pop bp ;restore BP back to entry value
ret ;leave with AX holding % similarity
Compare:
; The compare routine locates the largest group of characters between string 1
; and string 2. This routine assumes that the direction flag is clear.
; Pass to this routine:
; BX = R1 (right side of string 1)
; DS:SI = L1 (left side of string 1)
; ES:Di = L2 (left side of string 2)
; BP = R2 (right side of string 2)
;
; This routine returns:
; DX = # of characters matching
; CL1 = Left side of first string that matches
; CL2 = Left side of second string that matches
; CR1 = Right side of first string that matches
; CR2 = Right side of second string that matches
;
; The compare routine is composed of two loops, an inner and an outer.
; The worst case scenario is that there are absolutely no characters in
; common between string 1 and string 2. In this case N x M compares are
; performed. However, when an equal condition occurs in the inner
; loop, then the next character to be examined in string 2 (for this loop)
; is advanced by the number of characters found equal. Whenever a new
; maximum number of characters in common is found, then the ending location
; of both the inner and outer loop is backed off by the difference between
; the new max chars value and the old max chars value for both loops. In
; short, if 5 characters have been found in common part of the way through
; the search, then we can cut our search short 5 characters before the
; true end of both strings since there is no chance of finding better than
; a 5 character match at that point. This technique means that an exact
; equal match will require only a single compare and combinations of other
; matches will proceed as efficiently as possible.
mov s2ed,bp ;store end of string 2
xor dx,dx ;init MAXCHARS
ForL3: push di ;save start of string 2
ForL4: push di ;save start of string 2
push si ;save start of string 1
mov cx,s2ed ;set up for calc of length of string 1
sub cx,di ;get length of string 1 -1
inc cx ;make proper length
push cx ;save starting length of string 1
repz cmpsb ;compare strings
jz Equal ;if equal, then skip fixes
inc cx ;inc back because CMPS decs even if not equal
Equal: pop ax ;get starting length of string 1
sub ax,cx ;get length of common characters
jnz NewMax ;more than 0 chars matched
pop si ;get back start of string 1
pop di ;get back start of string 2
Reent: inc di ;do the next char no matter what
Reent2: cmp di,bp ;are we done with string 2?
jle ForL4 ;no, then do next string compare
pop di ;get back start of string 2
inc si ;next char in string 1 to scan
cmp si,bx ;are we done with string 1?
jle ForL3 ;no, then do next string compare
ret ;MAXCHARS is in DX
; We branch downwards for both newmax and newmx2 because on the
; 8086 line of processors a branch not taken is much faster than
; one which is. Since the not equal condition is to be found
; most often and we would like the inner loop to execute as quickly
; as possible, we branch outside of this loop on the less frequent
; occurrence. When a match or a new MAXCHARS is found, we branch down
; to these two routines, process the new conditions, and then branch
; back up to the main line code.
NewMax:
;TH: we pop SI and DI in any case
pop si ;get back start of string 1 TH
pop di ;get back start of string 2 TH
cmp ax,dx ;greater than MAXCHARS?
jg NewMx2 ;yes, update new MAXCHARS and pointers
;TH pop si ;get back start of string 1
;TH pop di ;get back start of string 2
add di,ax ;skip past matching chars
jmp short Reent2 ;reenter main loop
NewMx2:
;TH pop si ;get back start of string 1
;TH pop di ;get back start of string 2
mov cL1,si ;put begin of match of string 1
mov cL2,di ;put begin of match of string 2
mov cx,ax ;save new MAXCHARS
sub ax,dx ;get delta for adjustment to ends of strings
sub bx,ax ;adjust end of string 1
sub bp,ax ;adjust end of string 2
mov dx,cx ;new MAXCHARS
dec cx ;set up for advance to last matching char
add di,cx ;advance to last matching char string 2
mov cR2,di ;put end of match of string 2
add cx,si ;advance to last matching char string 1
mov cR1,cx ;put end of match of string 1
jmp short Reent ;reenter inner loop
PushSt:
; On Entry:
; BX = R1 (right side of string 1)
; DS:SI = L1 (left side of string 1)
; ES:Di = L2 (left side of string 2)
; BP = R2 (right side of string 2)
mov cx,bp ;save R2
mov bp,stcknum
shl bp,1 ;*2 for words
mov [bp+ststr1L],si ;put left side of string 1 on stack
mov [bp+ststr1R],bx ;put right side of string 1 on stack
mov [bp+ststr2L],di ;put left side of string 2 on stack
mov [bp+ststr2R],cx ;put right side of string 2 on stack
inc stcknum ;add one to number of stack entries
mov bp,cx ;restore R2
ret
PopSt:
; BX = R1 (right side of string 1)
; DS:SI = L1 (left side of string 1)
; ES:Di = L2 (left side of string 2)
; BP = R2 (right side of string 2)
dec stcknum ;point to last entry on stack
mov bp,stcknum ;get number of stack entries
shl bp,1 ;*2 for words
mov si,[bp+ststr1L] ;restore left side of string 1 from stack
mov bx,[bp+ststr1R] ;restore right side of string 1 from stack
mov di,[bp+ststr2L] ;restore left side of string 2 from stack
mov bp,[bp+ststr2R] ;restore right side of string 2 from stack
ret
_Simil_a endp