Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Soundex for Arabic text #73

Merged
merged 3 commits into from
Nov 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 127 additions & 0 deletions pyarabic/soundex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#!/usr/bin/python
# -*- coding=utf-8 -*-
"""
soundex
Utility functions used by to encode homophones to the same representation so that they can be matched despite minor differences in spelling (Method was imitated with the one in phpAr)
@author: Odai Alghamdi <odai-alghamdi at outlook dot com>
@author: Odai Alghamdi
@contact: Odai Alghamdi at outlook dot com
@copyright: Arabtechies, Arabeyes, Taha Zerrouki
@license: GPL
@date:2023/10/31
@version:0.3
"""

SOUNDEX_CODE = {
u'\u0627' : "0",
u'\u0648' : "0",
u'\u064a' : "0",
u'\u0639' : "0",
u'\u062d' : "0",
u'\u0647' : "0",
u'\u0641' : "1",
u'\u0628' : "1",
u'\u062e' : "2",
u'\u062c' : "2",
u'\u0632' : "2",
u'\u0633' : "2",
u'\u0635' : "2",
u'\u0638' : "2",
u'\u0642' : "2",
u'\u0643' : "2",
u'\u063a' : "2",
u'\u0634' : "2",
u'\u062a' : "3",
u'\u062b' : "3",
u'\u062f' : "3",
u'\u0630' : "3",
u'\u0636' : "3",
u'\u0637' : "3",
u'\u0629' : "3",
u'\u0644' : "4",
u'\u0645' : "5",
u'\u0646' : "5",
u'\u0631' : "6",
}

SOUNDEX_TRANSLATION = {
u'\u0627' : "A", # أ
u'\u0628' : "B", # ب
u'\u062a' : "T", # ت
u'\u062b' : "T", # ث
u'\u062c' : "J", # ج
u'\u062d' : "H", # ح
u'\u062e' : "K", # خ
u'\u062f' : "D", # د
u'\u0630' : "Z", # ذ
u'\u0631' : "R", # ر
u'\u0632' : "Z", # ز
u'\u0633' : "S", # س
u'\u0634' : "S", # ش
u'\u0635' : "S", # ص
u'\u0636' : "D", # ض
u'\u0637' : "T", # ط
u'\u0638' : "Z", # ظ
u'\u0639' : "A", # ع
u'\u063a' : "G", # غ
u'\u0641' : "F", # ف
u'\u0642' : "Q", # ق
u'\u0643' : "K", # ك
u'\u0644' : "L", # ل
u'\u0645' : "M", # م
u'\u0646' : "N", # ن
u'\u0647' : "H", # ه
u'\u0648' : "W", # و
u'\u064a' : "Y", # ي
}



def soundex_map_code(word: str):

encoded_word = ''
word_length = len(word)

for i in range(word_length):
char = word[i]
if char in SOUNDEX_CODE:
encoded_word += SOUNDEX_CODE[char]
else:
encoded_word+= str(0)

return encoded_word


def soundex_trim_rep(word: str):

last_char = None
clean_word = ""
word_length = len(word)

for i in range(word_length):
char = word[i]
if char != last_char:
clean_word+= char
last_char = char
return clean_word


def soundex(word:str , length:int = 6) -> str:

soundex = word[0]
soundex = SOUNDEX_TRANSLATION[soundex]
rest = word[1:]

encoded_rest = soundex_map_code(rest)
clean_encoded_rest = soundex_trim_rep(encoded_rest)

soundex += clean_encoded_rest
soundex = soundex.replace("0","")
total_len = len(soundex)

if total_len > length :
soundex = soundex[0:length]
else:
soundex += "0"*(length - total_len)

return soundex
14 changes: 14 additions & 0 deletions tests/test_soundex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/python
# -*- coding=utf-8 -*-
import unittest
import pyarabic.soundex as soundex

class SoundexTestCase(unittest.TestCase):
"""Tests for `number.py`."""

def test_soundex(self):
self.assertEqual(soundex.soundex("عدي",4),"A300")


if __name__ == '__main__':
unittest.main()