REBOL [
	Title: "Similarity Metrics"
	File: %simetrics.r
	Date: 19-Feb-2006 
	Purpose: "Toolkit of string distance metrics."
	Version: 0.5.0
	Library: [
		Level: 'intermediate
		Platform: 'all
		Type: [function module tool]
		Domain: [ai math parse scientific text text-processing]
		tested-under: [
			core 2.5.6.3.1 on [WXP] "fvz"
		]
		Support: none
		License: 'bsd
		See-also: none
	]
	Author: "Francois Vanzeveren (fvz)"
	History: [
		0.5.0 [19-Feb-2006 {
			Improved interface for 'accumulate-statistics and 'get-similarity: 
				the corpus statistics are returned to the client and must be passed back
				to 'get-similarity for token-based metrics.
			} "fvz"]
		0.4.1 [19-Feb-2006 {
			- BUG FIX: recursive calls to get-similarity/jaro(-winkler) must be done with 'case and 'strict refinements.
			- Performance improvement: Jaccard metric accepts pre-tokenized source string.
		} "fvz"]
		0.4.0 [18-Feb-2006 {
			- NEW METRIC: Hybrid Jaccard - Jaro
			- NEW METRIC: Hybrid Jaccard - Jaro-Winkler} "fvz"]
		0.3.2 [18-Feb-2006 {
			- Some code improvement for 'sm-multiply 'sm-divide and 'sm-max (thanks to Marco)
			- NEW TOKENIZER: 'tokenize-text
		} "fvz"]
		0.3.1 [18-Feb-2006 {
			- Interface change for hybrid token-based metrics
			- BUG FIX: remove folders in the corpus} "fvz"]
		0.3.0 [17-Feb-2006 {
			NEW METRIC: Term Frequency-Inverse Document Frequency
			NEW METRIC: Term Frequency-Inverse Document Frequency with Jaro (slow!)
			NEW METRIC: Term Frequency-Inverse Document Frequency with Jaro-Winkler (slow!)
		} "fvz"]
		0.2.1 [12-Feb-2006 "BUG FIX: in simetrics/ctx-jaro/get-prefix-length. Thanks to Sunanda from Rebol.org" "fvz"]
		0.2.0 [11-Feb-2006 {
			- NEW METRIC: Jaccard Similarity
			- /case refinement: matching is case sensitive
			- /strict refinement: matching is non-english characters sensitive
		} "fvz"]
		0.1.1 [11-Feb-2006 {
			- The script is now compatible with older versions of rebol (i.e. < 2.6.x)
			- BUG FIX Levenshtein: the substitution's cost was not properly computed from the deletion's and insertion's costs
			- Levenshtein: result normalized
			- NEW METRIC: Jaro and Jaro-Winkler metrics
			- New interface: 'get-similarity is the unique entry point to all metrics
		} "fvz"]
		0.1.0 [10-Feb-2006 "Levenshtein distance implemented" "fvz"]
		0.0.1 [9-Feb-2006 "Created this file" "fvz"]
	]
]

simetrics: context [
	set 'accumulate-statistics function [
		"Accumulates and returns the statistics on the corpus of documents."
		corpus [file! block! hash!] "Path to where the collection of documents (the corpus) is stored, or the corpus itself!"
		tokenize [function!] "Splits a string into tokens."
	] [
		corpus-path
		document-frequency ; maps each word to the number of documents in which it appears
		; maps each document d of the corpus D to the number of times each
		; token/word w appears in d and the weight of w in d
		; The structure is:
		; [
		;	d1 [
		;		w1 [frequency weight]
		;		w2 [frequency weight]
		;		...
		;	]
		;	d2 [
		;		w1 [frequency weight]
		;		w2 [frequency weight]
		;		...
		;	]
		;	...
		; ]
		term-stats
		d-stats
		tokens prev-token
		pointer count corpus-size
		w w-stats w-count w-weight 
		err
	] [
		corpus-path: none
		if file? corpus [
			corpus-path: corpus
			corpus: read corpus-path
			; Normalize to [doc-name doc] pairs: drop sub-directories and
			; pair each file name with none (its content is read later)
			forall corpus [
				either dir? join corpus-path first corpus
					[corpus: back remove corpus]
					[insert corpus: next corpus none]
			]
			corpus: head corpus
		]
		
		document-frequency: make hash! []
		term-stats: make hash! []
		d-stats: make hash! []
		; The following loop counts two frequencies for each token w:
		;  * the number of documents in which w appears (document-frequency)
		;  * within each document, the number of occurrences of w (term-stats)
		foreach [doc-name doc] corpus [
			tokens: sort tokenize either doc [copy doc] [read join corpus-path doc-name]
			prev-token: none
			forall tokens [
				either not all [prev-token equal? prev-token first tokens] [
					; increment document frequency counts
					pointer: find document-frequency first tokens
					either pointer [
						count: add 1 first next pointer
						change next pointer count
					] [
						repend document-frequency [first tokens 1]
					]
					if prev-token [append d-stats compose/deep [(prev-token) [(w-count) 0]]]
					w-count: 1
				] [
					w-count: add w-count 1
				]
				prev-token: first tokens
			]
			; record the last token run of the document
			; (the loop above only flushes counts on a token change)
			if prev-token [append d-stats compose/deep [(prev-token) [(w-count) 0]]]
			repend term-stats [doc-name d-stats]
			d-stats: make hash! []
		]
		corpus: head corpus
		
		; The following computes the weight of each token w in each document of the corpus (term-stats)
		corpus-size: divide length? corpus 2
	
		forskip term-stats 2 [
			tokens: second term-stats
			forskip tokens 2 [
				w: first tokens
				w-stats: second tokens
				w-count: first w-stats
				; weight(w, d) = tf(w, d) * log10(|D| / df(w))
				w-weight: multiply w-count
					log-10 divide corpus-size select document-frequency w
				change next w-stats w-weight
			]
		]
		term-stats: head term-stats
		return reduce [document-frequency term-stats]
	]
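
	; Usage sketch (hypothetical %corpus/ directory of plain-text documents):
	;	stats: accumulate-statistics %corpus/ :tokenize-text
	;	get-similarity/tfidf "some query" stats :tokenize-text
	; The TF-IDF call returns [score doc-name ...] pairs sorted by decreasing score.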
	
	set 'get-similarity function [
		{Measures the similarity of two strings.}
		s [string! block!] "Source string or token multiset. Token multiset is ONLY for Jaccard metric!"
		/jaro 
			{Measures the similarity of two strings based on the 
			number and the order of common characters between them.}
			t-jaro [string!] "Target string"
		/jaro-winkler
			{Variant of the Jaro metric adjusting the weighting for common prefixes.}
			t-jaro-winkler [string!] "Target string"
		/levenshtein
			{Measures the distance (i.e. similarity) between two strings. 
			The distance is the number of edit operations (deletions, 
			insertions, substitutions) required to transform the source 
			string into the target string.}
			t-levenshtein [string!] "Target string"
		/levenstein {See /levenshtein.}
			t-levenstein [string!] "Target string"
		/del-cost dc [number!] "Deletion's cost. ONLY with /levenshtein refinement."
		/ins-cost ic [number!] "Insertion's cost. ONLY with /levenshtein refinement."
		/sub-cost sc [number!] "Substitution's cost. ONLY with /levenshtein refinement."
		/case "Characters are case-sensitive."
		/strict "Matching is non-english characters sensitive."
		
		; Token-Based Metrics
		/jaccard
			{Token based distance function. The Jaccard similarity between 
			word sets S and T is simply |S intersect T| / |S union T|}
			t-jaccard [string!] "Target string"
			tokenize-jaccard [function!] "Splits a string into tokens."
		/tfidf	"Term Frequency-Inverse Document Frequency Metric"
			corpus-stats [block!] "Corpus statistics as built and returned by 'accumulate-statistics."
			tokenize-tfidf [function!] "Splits a string into tokens."
		; Hybrid token-based metrics
		/jaro-hybrid "Hybrid token-based and Jaro metric. ONLY with a token-based metric."
		/jaro-winkler-hybrid "Hybrid token-based and Jaro-Winkler metric. ONLY with a token-based metric."
	] [t tokenize] [
		t: any [t-jaro t-jaro-winkler t-levenshtein t-levenstein t-jaccard]
		tokenize: any [:tokenize-jaccard :tokenize-tfidf]
		; TOKEN-BASED METRICS
		if jaccard [
			use [s-bag t-bag current-score max-score score] [
				s-bag: unique either block? s [s] [tokenize s]
				t-bag: unique tokenize t
				either any [jaro-hybrid jaro-winkler-hybrid] [
					score: 0
					forall s-bag [
						current-score: max-score: 0
						forall t-bag [
							if jaro-hybrid 			[current-score: get-similarity/jaro/case/strict 		first s-bag first t-bag]
							if jaro-winkler-hybrid  [current-score: get-similarity/jaro-winkler/case/strict first s-bag first t-bag]
							max-score: max current-score max-score
						]
						t-bag: head t-bag
						score: add score max-score
					]
					s-bag: head s-bag
					t-bag: head t-bag 
					return divide score max length? s-bag length? t-bag
				] [
					return divide
							length? intersect s-bag t-bag
							length? union s-bag t-bag
				]
			]
		]
		if tfidf [
			use [document-frequency term-stats retval q-tok f-score score 
				tokens j-score w-stats q-tok-scores w-scores d-frequency
			] [
				document-frequency: first corpus-stats
				term-stats: second corpus-stats
				retval: make hash! []
				q-tok: sort unique tokenize s
				q-tok-scores: make hash! []
				if any [jaro-hybrid jaro-winkler-hybrid] [
					d-frequency: document-frequency
					forall q-tok [
						w-scores: make hash! []
						forskip d-frequency 2 [
							if jaro-hybrid 		[j-score: get-similarity/jaro/case/strict first q-tok first d-frequency]
							if jaro-winkler-hybrid [j-score: get-similarity/jaro-winkler/case/strict first q-tok first d-frequency]
							if greater-or-equal? j-score 0.9 [repend w-scores [first d-frequency j-score]]
						]
						repend q-tok-scores [first q-tok w-scores]
						d-frequency: head d-frequency
					]
				]
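				; Select the scoring block once, outside the document loop: the
				; hybrid block matches each query token against the corpus words
				; via the Jaro(-Winkler) scores gathered above, the plain block
				; looks the token's TF-IDF weight up directly. 'do f-score runs
				; the selected block for each query token below.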
				f-score: select reduce [
					any [jaro-hybrid jaro-winkler-hybrid] [
						w-scores: select q-tok-scores first q-tok
						forskip w-scores 2 [
							if w-stats: select tokens first w-scores [
								j-score: second w-scores
								score: add score multiply j-score second w-stats
							]
						]
					]
					true [
						w-stats: select tokens first q-tok
						score: add score either w-stats [second w-stats] [0]
					]
				] true
				
				forskip term-stats 2 [
					score: 0
					tokens: second term-stats
					forall q-tok [
						do f-score
					]
					repend retval [score first term-stats]
					q-tok: head q-tok
				]
				term-stats: head term-stats
				return sort/skip/reverse retval 2
			]
		]
		
		; NON TOKEN-BASED METRICS
		trim s: copy any [s ""]
		trim t: copy any [t ""]
		if not case [
			lowercase s
			lowercase t
		]
		if not strict [
			parse/all s [
				any [
					mark: alpha-ext (change mark select/case alpha-map first mark)
					| skip
				]
			]
			parse/all t [
				any [
					mark: alpha-ext (change mark select/case alpha-map first mark)
					| skip
				]
			]
		]
		if jaro [
			use [half-len s-common t-common transpositions] [
				; matching window: half the length of the shorter string
				; (characters within this distance of each other may count as common)
				half-len: to-integer divide min length? s length? t 2
				; get common characters
				s-common: ctx-jaro/get-common-characters s t half-len
				t-common: ctx-jaro/get-common-characters t s half-len
				
				; Check for empty and/or different size common strings
				if any [
					not-equal? length? s-common length? t-common
					empty? s-common empty? t-common
				] [return 0]
				
				; Get the number of transpositions
				; A transposition for s-common, t-common is a position i 
				; such that s-common[i] <> t-common[i]
				transpositions: 0
				for i 1 length? s-common 1 [
					if not-equal? s-common/:i t-common/:i [transpositions: add transpositions 1]
				]
				transpositions: divide transpositions 2
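				; Jaro similarity: (|s'|/|s| + |t'|/|t| + (|s'| - T) / |s'|) / 3
				; where s' and t' are the common characters and T the transposition count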
				return divide 
							add add 
									divide length? s-common length? s
									divide length? t-common length? t
								divide 
									subtract length? s-common transpositions
									length? s-common
							3
			]
		]
		if jaro-winkler [
			use [dist prefix-length] [
				dist: get-similarity/jaro/case/strict s t
				; This extension modifies the weights of poorly matching pairs s, t which share a common prefix
				prefix-length: ctx-jaro/get-prefix-length s t
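				; jaro-winkler = jaro + prefix-length * scale * (1 - jaro)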
				return add dist
							multiply multiply prefix-length ctx-jaro/PREFIXADJUSTMENTSCALE
									 subtract 1 dist
			]
		]
		if any [levenshtein levenstein] [
			use [dist max-len] [
				; 0.1.1 - default edit costs: substitution defaults to twice the
				; greater of the deletion/insertion costs (or 2), deletion to the
				; insertion cost (else half the substitution cost), and insertion
				; to the deletion cost
				either any [ins-cost del-cost sub-cost] [
					sc: any [sc sm-multiply sm-max dc ic 2 2]
					dc: any [dc ic sm-divide sc 2]
					ic: any [ic dc]
				] [
					sc: dc: ic: 1
				]
				dist: ctx-levenshtein/get-distance
							back tail s 
							back tail t 
							array reduce [length? t length? s]
							dc ic sc
				; the maximum possible Levenshtein distance for these strings
				max-len: max length? s length? t
				if zero? max-len [return 1] ; both strings are empty, hence identical
				; normalize: 1 - (actual / maximum distance) maps the result to the 0-1 range
				subtract 1 divide dist max-len
			]
		]
	]
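
	; Usage sketch - each metric returns a score from 0.0 (dissimilar) to 1.0 (identical):
	;	get-similarity/jaro "martha" "marhta"
	;	get-similarity/jaro-winkler "dixon" "dicksonx"
	;	get-similarity/levenshtein "kitten" "sitting"
	;	get-similarity/jaccard "the quick brown fox" "the lazy brown dog" :tokenize-text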
	sm-divide: func [
		"Returns the first value divided by the second."
		value1 [number! pair! char! money! time! tuple! none!]
		value2 [number! pair! char! money! time! tuple! none!]
	] [
		all [
			value1 value2
			divide value1 value2
		]
	]
	sm-multiply: func [
		"Returns the first value multiplied by the second."
		value1 [number! pair! char! money! time! tuple! none!]
		value2 [number! pair! char! money! time! tuple! none!]
	] [
		all [
			value1 value2
			multiply value1 value2
		]
	]
	sm-max: func [
		"Returns the greater of the two values."
		value1 [number! pair! char! money! date! time! tuple! series! none!]
		value2 [number! pair! char! money! date! time! tuple! series! none!]
	] [
		all [
			value1: any [value1 value2]
			value2: any [value2 value1]
			max value1 value2
		]
	]
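	; The sm-* helpers are none-tolerant variants of the corresponding natives,
	; used above to default the Levenshtein edit costs,
	; e.g. sm-max none 3 yields 3, while sm-divide 4 none yields none.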
	alpha-map: make block! reduce [	
		make char! 131	make char! 102	; ƒ to f
		make char! 138	make char! 83	; Š to S
		make char! 142	make char! 90	; Ž to Z
		make char! 154	make char! 115	; š to s
		make char! 158	make char! 122	; ž to z
		make char! 159	make char! 89	; Ÿ to Y
		make char! 192	make char! 65	; À to A
		make char! 193	make char! 65	; Á to A
		make char! 194	make char! 65	; Â to A
		make char! 195	make char! 65	; Ã to A
		make char! 196	make char! 65	; Ä to A
		make char! 197	make char! 65	; Å to A
		make char! 199	make char! 67	; Ç to C
		make char! 200	make char! 69	; È to E
		make char! 201	make char! 69	; É to E
		make char! 202	make char! 69	; Ê to E
		make char! 203	make char! 69	; Ë to E
		make char! 204	make char! 73	; Ì to I
		make char! 205	make char! 73	; Í to I
		make char! 206	make char! 73	; Î to I
		make char! 207	make char! 73	; Ï to I
		make char! 208	make char! 68	; Ð to D
		make char! 209	make char! 78	; Ñ to N
		make char! 210	make char! 79	; Ò to O
		make char! 211	make char! 79	; Ó to O
		make char! 212	make char! 79	; Ô to O
		make char! 213	make char! 79	; Õ to O
		make char! 214	make char! 79	; Ö to O
		make char! 217	make char! 85	; Ù to U
		make char! 218	make char! 85	; Ú to U
		make char! 219	make char! 85	; Û to U
		make char! 220	make char! 85	; Ü to U
		make char! 221	make char! 89	; Ý to Y
		make char! 224	make char! 97	; à to a
		make char! 225	make char! 97	; á to a
		make char! 226	make char! 97	; â to a
		make char! 227	make char! 97	; ã to a
		make char! 228	make char! 97	; ä to a
		make char! 229	make char! 97	; å to a
		make char! 231	make char! 99	; ç to c
		make char! 232	make char! 101	; è to e
		make char! 233	make char! 101	; é to e
		make char! 234	make char! 101	; ê to e
		make char! 235	make char! 101	; ë to e
		make char! 236	make char! 105	; ì to i
		make char! 237	make char! 105	; í to i
		make char! 238	make char! 105	; î to i
		make char! 239	make char! 105	; ï to i
		make char! 241	make char! 110	; ñ to n
		make char! 242	make char! 111	; ò to o
		make char! 243	make char! 111	; ó to o
		make char! 244	make char! 111	; ô to o
		make char! 245	make char! 111	; õ to o
		make char! 246	make char! 111	; ö to o
		make char! 249	make char! 117	; ù to u
		make char! 250	make char! 117	; ú to u
		make char! 251	make char! 117	; û to u
		make char! 252	make char! 117	; ü to u
		make char! 253	make char! 121	; ý to y
		make char! 255	make char! 121	; ÿ to y
	]
	alpha-ext: make block! []
	forskip alpha-map 2 [
		append alpha-ext first alpha-map
	]
	alpha-map: head alpha-map ; for compatibility with Rebol/Core < 2.6.x
	
	; Charsets
	digit: charset [#"0" - #"9"]
	alpha-ext: charset alpha-ext
	alpha: charset [#"A" - #"Z" #"a" - #"z"]
	alphanum: union alpha digit
	space: charset reduce [#" " newline crlf tab]
	
	ctx-jaro: context [
		;maximum prefix length to use.
		MINPREFIXTESTLENGTH: 6
		
		;prefix adjustment scale.
		PREFIXADJUSTMENTSCALE: 0.1
		
		get-common-characters: func [
			{Returns the characters of string1 that also occur in string2 
			within the given distance (separation) of their position in string1.}
			string1 [string!]
			string2 [string!]
			distance-sep [integer!]
			/local return-commons pos str
		] [
			; create a return string of characters
			return-commons: copy ""
			; create a copy of string2 for processing
			string2: copy string2
			; iterate over string1
			forall string1 [
				if found? str: find/part 
							at string2 add 1 pos: subtract index? string1 distance-sep
							first string1
							subtract 
								add multiply distance-sep 2 min pos 0
								1 
				[
					; append character found
					append return-commons first string1
					; alter copied string2 for processing
					change/part str to-char 0 1
				]
				string2: head string2
			]
			return-commons
		]
		get-prefix-length: func [
			"Returns the prefix length found of common characters at the begining of the strings."
			string1 [string!]
			string2 [string!]
			/local n
		] [
			n: first minimum-of reduce [MINPREFIXTESTLENGTH length? string1 length? string2]
			for i 1 n 1 [
				; check the prefix is the same so far
				if not-equal? string1/:i string2/:i [
					; not the same so return as far as got
					return subtract i 1
				]
			]
			; 0.2.1
			return n ; first n characters are the same
		]
	]
	ctx-levenshtein: context [
		get-distance: function [
			s [string!] "Source string"
			t [string!] "Target string"
			m [block!] "Memoization matrix of partial distances (t-index x s-index)"
			dc [number!] "Deletion's cost"
			ic [number!] "Insertion's cost"
			sc [number!] "Substitution's cost"
		] [
			letter-copy letter-substitute 
			letter-insert letter-delete
			i j 
		] [
			if empty? head s [return length? head t]
			if empty? head t [return length? head s]
; 0.1.1
;			if m/(index? t)/(index? s) [return m/(index? t)/(index? s)]
			j: index? t
			i: index? s
			if m/:j/:i [return m/:j/:i]	
			letter-copy: letter-substitute: letter-insert: letter-delete: 1E+99
			; Copy t[j] to s[i]
			if equal? first s first t [
				letter-copy: do select reduce [
					all [head? s head? t] [0]
					true [get-distance back s back t m dc ic sc]
				] true
			]
			; Substitute t[j] for s[i]
			letter-substitute: add sc do select reduce [
					all [head? s head? t] [0]
					head? s [subtract index? t 1]
					head? t [subtract index? s 1]
					true [get-distance back s back t m dc ic sc]
				] true
			; Insert the letter t[j]
			letter-insert: add ic do select reduce [
				head? t [index? s]
				true [get-distance s back t m dc ic sc]
			] true
			; Delete the letter s[i]
			letter-delete: add dc do select reduce [
				head? s [index? t]
				true [get-distance back s t m dc ic sc]
			] true
; 0.1.1
;			m/(index? t)/(index? s): first minimum-of reduce [letter-copy letter-substitute letter-insert letter-delete]
			poke m/:j i first minimum-of reduce [letter-copy letter-substitute letter-insert letter-delete]
			m/:j/:i
		]
	]
]

; Sample tokenizers
tokenize-rebol-script: func [
	"Converts a string to a token multiset (where each token is a word)."
	str [string!]
	/local rebol-punctuation tokens t-alpha t-digit char mark
			space alpha-ext alpha digit
] [
	space: 		simetrics/space
	alpha-ext: 	simetrics/alpha-ext
	alpha:		simetrics/alpha
	digit:		simetrics/digit
	rebol-punctuation: charset "-!?~"
	
	tokens: make block! []
	t-alpha: copy ""
	t-digit: copy ""
	parse/all str [
		any [ "64#{" thru "}" |
			copy char rebol-punctuation (
				if not empty? t-alpha [append t-alpha char]	
				if not empty? t-digit [
					append tokens t-digit
					t-digit: copy ""
				]
			) |
			copy char space (
				if not empty? t-alpha [
					append tokens t-alpha
					t-alpha: copy ""
				]
				if not empty? t-digit [
					append tokens t-digit
					t-digit: copy ""
				]
			) |
			copy char alpha-ext (
				if not empty? t-digit [
					append tokens t-digit
					t-digit: copy ""
				]
				lowercase char
				char: select/case simetrics/alpha-map first char
				append t-alpha char
			) | 
			copy char alpha (
				if not empty? t-digit [
					append tokens t-digit
					t-digit: copy ""
				]
				lowercase char
				append t-alpha char
			) | 
			copy char digit (
				either not empty? t-alpha [
					append t-alpha char
				] [
					append t-digit char
				]
			) | 
			mark: (
				if not empty? t-alpha [
					append tokens t-alpha
					t-alpha: copy ""
				]
				if not empty? t-digit [
					append tokens t-digit
					t-digit: copy ""
				]
			) skip
		]
	]
	return tokens
]
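
; Example: tokens are lowercased; REBOL punctuation ("-!?~") binds to the
; preceding word and binary literals (64#{...}) are skipped, e.g.
;	tokenize-rebol-script {to-integer 64#{AA==} x} ; == ["to-integer" "x"]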

tokenize-rebol-script-query: func [
	"Converts a string to a token multiset (where each token is a word)."
	str [string!]
] [
	tokenize-rebol-script str
]

tokenize-text: func [
	"Converts a string to a token multiset (where each token is a word)."
	str [string!]
	/local rebol-punctuation tokens t-alpha t-digit char mark
			space alpha-ext alpha digit
] [
	space: 		simetrics/space
	alpha-ext: 	simetrics/alpha-ext
	alpha:		simetrics/alpha
	digit:		simetrics/digit
	
	tokens: make block! []
	t-alpha: copy ""
	t-digit: copy ""
	parse/all str [
		any [
			copy char space (
				if not empty? t-alpha [
					append tokens t-alpha
					t-alpha: copy ""
				]
				if not empty? t-digit [
					append tokens t-digit
					t-digit: copy ""
				]
			) |
			copy char alpha-ext (
				if not empty? t-digit [
					append tokens t-digit
					t-digit: copy ""
				]
				lowercase char
				char: select/case simetrics/alpha-map first char
				append t-alpha char
			) | 
			copy char alpha (
				if not empty? t-digit [
					append tokens t-digit
					t-digit: copy ""
				]
				lowercase char
				append t-alpha char
			) | 
			copy char digit (
				either not empty? t-alpha [
					append t-alpha char
				] [
					append t-digit char
				]
			) | 
			mark: (
				if not empty? t-alpha [
					append tokens t-alpha
					t-alpha: copy ""
				]
				if not empty? t-digit [
					append tokens t-digit
					t-digit: copy ""
				]
			) skip
		]
	]
	return tokens
]
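
; Example - accented characters are folded and digit runs form separate tokens, e.g.
;	tokenize-text "Déjà vu, 42 times" ; == ["deja" "vu" "42" "times"]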

