From 75d96dc7d2c0406dda7a4a3d4bba931e9535b9e3 Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Fri, 15 Nov 2013 15:11:46 -0500 Subject: [PATCH] levenshtein: half substitution penalty if same case We tweak the penalties so that if they differ only by their casing, then the substitution penalty is halved. Example of the effect: Equal weights: SYS_close --> SYSC_close, SyS_close, SYSC_clone, SyS_clone, con_close Half penalty for diff casing SYS_close --> SyS_close, SYSC_close, SyS_clone, sys_close, SYSC_clone The 'SYSC' versions rank lower. But more importantly, we see 'sys_close' become higher rank than 'con_close'. --- util.cxx | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/util.cxx b/util.cxx index b2c6df053..32ea329fe 100644 --- a/util.cxx +++ b/util.cxx @@ -1120,10 +1120,16 @@ levenshtein(const string& a, const string& b) if (a[i-1] == b[j-1]) // match d(i,j) = d(i-1, j-1); else // penalties open for adjustments - d(i,j) = min(min( - d(i-1,j-1) + 1, // substitution - d(i-1,j) + 1), // deletion - d(i,j-1) + 1); // insertion + { + unsigned subpen = 2; // substitution penalty + // check if they are upper/lowercase related + if (tolower(a[i-1]) == tolower(b[j-1])) + subpen = 1; // half penalty + d(i,j) = min(min( + d(i-1,j-1) + subpen, // substitution + d(i-1,j) + 2), // deletion + d(i,j-1) + 2); // insertion + } } } -- 2.43.5