Skip to content

Commit

Permalink
FEAT: improved the speed of the soundex code and fixed it to produc…
Browse files Browse the repository at this point in the history
…e the same results as PHP.
  • Loading branch information
Oldes committed Jul 16, 2024
1 parent 0b0b6ec commit c485c89
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 65 deletions.
1 change: 1 addition & 0 deletions src/boot/sysobj.reb
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ modules: object [
httpd: https://src.rebol.tech/modules/httpd.reb
prebol: https://src.rebol.tech/modules/prebol.reb
scheduler: https://src.rebol.tech/modules/scheduler.reb
soundex: https://src.rebol.tech/modules/soundex.reb
spotify: https://src.rebol.tech/modules/spotify.reb
thru-cache: https://src.rebol.tech/modules/thru-cache.reb
to-ascii: https://src.rebol.tech/modules/to-ascii.reb
Expand Down
133 changes: 68 additions & 65 deletions src/modules/soundex.reb
Original file line number Diff line number Diff line change
@@ -1,78 +1,81 @@
REBOL [
Title: "Soundex"
Date: 16-Jul-2024
File: %soundex.r
Author: "Allen Kamp, Oldes"
Purpose: {Soundex Encoding returns similar codes for similar sounding words or names. eg Stephens, Stevens are both S315, Smith and Smythe are both S53. Useful for adding Sounds-like searching to databases}
Comment: {
This is the basic Soundex algorithm (There are a number of different
one floating around)
Title: "Soundex"
Date: 16-Jul-2024
File: %soundex.reb
Author: "Allen Kamp, Oldes"
Purpose: {Soundex Encoding returns similar codes for similar sounding words or names. eg Stephens, Stevens are both S315, Smith and Smythe are both S530. Useful for adding Sounds-like searching to databases}
Comment: {
This is the basic Soundex algorithm: https://en.wikipedia.org/wiki/Soundex
1. Remove vowels, H, W and Y
2. Encode each char with its code value
3. Remove adjacent duplicate numbers
1. Remove vowels, H, W and Y
2. Encode each char with its code value
3. Remove adjacent duplicate numbers
4. Return First letter, followed by the next 3 letter's code
numbers, if they exist.
4. Return First letter, followed by the next 3 letter's code
numbers, if they exist.
TODO: Other algorithms: Extended Soundex, Metaphone and the LC Cutter table
}
Language: "English"
Email: %allenk--powerup--com--au
library: [
level: 'intermediate
platform: 'all
type: 'tool
domain: [DB text text-processing]
tested-under: none
support: none
license: none
see-also: none
]
Version: 1.1.0
Type: module
Exports: [soundex]
Needs: 3.0.0
History: [
17-Jul-1999 @Allen "Initial version"
16-Jul-2024 @Oldes "Ported to Rebol3"
TODO: Other algorithms: Extended Soundex, Metaphone and the LC Cutter table
}
Version: 2.0.0
Type: module
Name: soundex
Exports: [soundex]
Needs: 3.0.0
History: [
17-Jul-1999 @Allen "Initial version"
16-Jul-2024 @Oldes "Ported to Rebol3"

]
]
]

; soundex: encodes a string as a Soundex code (one leading character followed by digits).
; NOTE(review): this span comes from a diff view and appears to interleave two
; versions of the function (duplicated docstring/argument lines, two `code:`
; initialisations, PARSE rules next to a map lookup). Confirm against the
; repository which lines belong to the current version before loading it.
soundex: function/with [
{Returns the Census Soundex Code for the given string}
string [any-string!] "String to Encode"
{Returns the Census Soundex Code for the given string}
string [any-string!] "String to Encode"
][
; `code` accumulates the result; `prev` holds the most recently appended digit.
code: make string! 4
prev: none

code: make string! ""
; Empty input short-circuits with the literal "0000".
if empty? string [return "0000"]

; PARSE-based variant: only non-empty string! values are encoded.
either all [string? string string <> ""] [
; Work on an upper-cased, trimmed copy of the argument.
string: uppercase trim copy string

foreach letter string [
; Each character is converted to a string and run through the match / no-match rules.
parse to string! letter [soundex-match | soundex-no-match]
if 4 = length? code [break] ;maximum length for code is 4
]
] [
return string ; return unchanged
]
change code first string ; replace first number with first letter
return code
; Map-based variant: look each character up in `mapping`.
foreach letter string [
either val: mapping/:letter [
; Append the digit only when it differs from `prev`.
if val != prev [append code val prev: val]
][
; a, e, i, o, u, h, w, y set `prev` to a space, so the next mapped digit is
; appended even if it equals the digit seen before this character.
if find "aeiouhwy" letter [prev: #" "]
; An unmapped first character leaves a #"0" placeholder in the first position.
if empty? code [append code #"0"]
]
if 4 = length? code [break] ;maximum length for code is 4
]
; Overwrite the first code character with the upper-cased first input character,
; then pad with #"0" to 4 characters and return the code.
change code uppercase first string
pad/with code 4 #"0"
code
][
; Words of the /with object used by the function body.
code: val: none
; Create Rules
set1: [[#"B" | #"F" | #"P" | #"V"](val: #"1")]
set2: [[#"C" | #"G" | #"J" | #"K" | #"Q" | #"S" | #"X" | #"Z"](val: #"2")]
set3: [[#"D" | #"T"](val: #"3")]
; NOTE(review): `val` is set to the string "4" here, while the other sets use char! literals - confirm this is intended.
set4: [[#"L"](val: "4")]
set5: [[#"M" | #"N"] (val: #"5")]
set6: [[#"R"](val: #"6")]
; Append val to code if not a duplicate of previous code val
; NOTE(review): `back tail code` is a series position, not a character, so the
; comparison with `val` may never detect a duplicate - verify.
soundex-match: [[set1 | set2 | set3 | set4 | set5 | set6 ]
(if val <> back tail code [append code val]) ]
; Words of the /with object used by the map-based body.
code: val: prev: none
; Letter -> Soundex digit table; characters without an entry yield none on lookup.
mapping: make map! [
;Set1
#"B" #"1"
#"F" #"1"
#"P" #"1"
#"V" #"1"
;Set2
#"C" #"2"
#"G" #"2"
#"J" #"2"
#"K" #"2"
#"Q" #"2"
#"S" #"2"
#"X" #"2"
#"Z" #"2"
;Set3
#"D" #"3"
#"T" #"3"
;Set4
#"L" #"4"
;Set5
#"M" #"5"
#"N" #"5"
;Set6
#"R" #"6"
]
]

; If letter not a matched letter its val is 0, but we only care
; about it if it is the first letter.
soundex-no-match: [(if (length? code) = 0 [append code "0"])]
]
22 changes: 22 additions & 0 deletions src/tests/test-soundex.r3
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
Rebol [
Title: "Test Soundex function"
Date: 16-Jul-2024
Author: "Oldes"
File: %test-soundex.r3
Version: 1.0.0
]
;; Encodes each sample name with SOUNDEX and prints one row per name:
;; expected code, name, actual code, and whether the two codes are equal.
use [actual][
    actual: none
    ;; Pairs of: expected Soundex code, name to encode.
    foreach [expected name] [
        "R163" "Robert"
        "R163" "Rupert"
        "R150" "Rubin"
        "A226" "Ashcraft"
        "A226" "Ashcroft"
        "T522" "Tymczak" ;; 'z' and 'k' are both coded as 2; the vowel between them keeps the second 2
        "P236" "Pfister"
        "H555" "Honeyman"
    ][
        actual: soundex name
        printf [5 9 5] reduce [expected name actual expected == actual]
    ]
]

0 comments on commit c485c89

Please sign in to comment.