-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwhichLanguage.py
133 lines (101 loc) · 4.09 KB
/
whichLanguage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import collections
import math as m
import re
# Directory holding the reference name lists. The default is the original
# developer location; set STREET_LANG_NAMES_DIR to run from another machine
# or checkout without editing this file.
import os

_NAMES_DIR = os.environ.get(
    "STREET_LANG_NAMES_DIR",
    "/Users/jfr/projects/street_lang/language_names",
)
# Both files are read as comma-separated tables; the rest of this module only
# reads their 'Name' column.
df_english = pd.read_csv(os.path.join(_NAMES_DIR, "english_names.txt"), sep=",")
df_french = pd.read_csv(os.path.join(_NAMES_DIR, "french_names.txt"), sep=",")
def getBigramFrequency(df, n=2):
    """Get the relative frequency of every n-letter sequence in a name list.

    Each entry of ``df['Name']`` is lower-cased and scanned with a sliding
    window of ``n`` characters. The windows of all names are pooled before
    the counts are normalised.

    Args:
        df: Object whose ``'Name'`` item is an iterable of names (for
            example a pandas DataFrame, or a plain dict of lists).
        n: Window length in characters. Values below 1 degenerate to a
            single empty-string key; this is kept for backward compatibility
            because module-level code calls this function with ``n=0``.

    Returns:
        dict mapping each n-letter sequence to its share of all counted
        windows, in order of first appearance. Empty when no window was
        counted (no names, or every name shorter than ``n``).
    """
    width = max(n, 0)
    counts = collections.Counter()
    for name in df['Name']:
        # Blank cells are returned by pandas as NaN (a float); skip anything
        # that is not text instead of crashing on ``.lower()``.
        if not isinstance(name, str):
            continue
        name = name.lower()
        # A name of length L has L - width + 1 windows. The min() keeps the
        # legacy result for width 0: one empty window per character.
        nbWindows = min(len(name), len(name) - width + 1)
        counts.update(name[i:i + width] for i in range(nbWindows))
    nbOccurence = sum(counts.values())
    # No division happens when counts is empty, so nbOccurence == 0 is safe.
    return {ngram: count / nbOccurence for ngram, count in counts.items()}
def getBigramFrequencySingleWord(name, n=2):
    """Get the relative frequency of every n-letter sequence in one word.

    The word is lower-cased and scanned with a sliding window of ``n``
    characters.

    Args:
        name: The word to analyse.
        n: Window length in characters. Values below 1 degenerate to a
            single empty-string key, as in the original implementation.

    Returns:
        dict mapping each n-letter sequence to its share of all windows in
        the word, in order of first appearance. Empty when the word is
        shorter than ``n``.
    """
    name = name.lower()
    width = max(n, 0)
    # A word of length L has L - width + 1 windows. The min() keeps the
    # legacy result for width 0: one empty window per character.
    nbWindows = min(len(name), len(name) - width + 1)
    counts = collections.Counter(name[i:i + width] for i in range(nbWindows))
    nbOccurence = sum(counts.values())
    # No division happens when counts is empty, so nbOccurence == 0 is safe.
    return {ngram: count / nbOccurence for ngram, count in counts.items()}
## Build the reference frequency tables
# Position i of each list holds the table for i-letter sequences, so the
# lists are filled for n = 0..3 even though only positions 1..3 are read
# again below.
freqEnglish = []
freqFrench = []
for nbLetters in range(4):
    freqEnglish.append(getBigramFrequency(df_english, n=nbLetters))
    freqFrench.append(getBigramFrequency(df_french, n=nbLetters))

# Give both languages the same key set: a sequence seen in only one language
# gets an explicit frequency of 0 in the other.
for nbLetters in range(1, 4):
    for key in freqEnglish[nbLetters]:
        freqFrench[nbLetters].setdefault(key, 0)
    for key in freqFrench[nbLetters]:
        freqEnglish[nbLetters].setdefault(key, 0)

# Trigram tables ordered by ascending frequency; the score is the plain
# French-minus-English difference.
trigramEnglish = collections.OrderedDict(
    sorted(freqEnglish[3].items(), key=lambda item: item[1]))
trigramFrench = collections.OrderedDict(
    sorted(freqFrench[3].items(), key=lambda item: item[1]))
trigramDiff = collections.OrderedDict(
    (key, trigramFrench[key] - trigramEnglish[key]) for key in trigramFrench)

# Bigram tables ordered the same way; here the difference is multiplied by
# its own absolute value, i.e. squared while keeping its sign.
bigramEnglish = collections.OrderedDict(
    sorted(freqEnglish[2].items(), key=lambda item: item[1]))
bigramFrench = collections.OrderedDict(
    sorted(freqFrench[2].items(), key=lambda item: item[1]))
bigramDiff = collections.OrderedDict()
for key in bigramFrench:
    bigramDiff[key] = (bigramFrench[key] - bigramEnglish[key]) * m.fabs(bigramFrench[key] - bigramEnglish[key])
def clean_str(word):
    """Remove generic stuff from the street.

    Like Rue, Avenue...

    Strips generic street-type words, the French particles "De la", "des",
    "du" and "de", the suffixes " road" and " Street", and every remaining
    space. Matching is case-sensitive.

    Args:
        word: The street name to clean.

    Returns:
        The name with the generic terms and all spaces removed.
    """
    tosub = (
        # Whole words only, so that e.g. "Dupont" does not lose "Pont".
        r'\b(?:Rue|Avenue|Chemin|Canal|Place|Rang|Boulevard|Autoroute|Pont|Croissant)\b'
        # Particles must start a word, so that "Grande " keeps its "de".
        r'|\b(?:De la|des|du|de) '
        # Suffixes must be tried before the bare space below: alternatives
        # are tried left to right, and a lone ' ' would otherwise consume
        # the leading space and leave "Street" behind.
        r'| (?:road|Street)\b'
        r'| '
    )
    return re.sub(tosub, '', word)
def whichLanguage(word):
    """Return language score.

    Positive means more likely to be french
    Negative means more likely to be english

    The word is first passed through ``clean_str``. The score is the sum of
    the module-level ``bigramDiff`` entries for each distinct 2-letter
    sequence in the cleaned word plus the ``trigramDiff`` entries for each
    distinct 3-letter sequence. Sequences missing from those tables
    contribute nothing, so a word with no known sequence scores 0.
    """
    # Clean once; the result is the same for every window length.
    cleaned = clean_str(word)

    # Only the keys of the per-word tables are used: each distinct sequence
    # counts once, regardless of how often it occurs in the word.
    bigram = 0
    for key in getBigramFrequencySingleWord(cleaned, n=2):
        if key in bigramDiff:
            bigram += bigramDiff[key]

    trigram = 0
    for key in getBigramFrequencySingleWord(cleaned, n=3):
        if key in trigramDiff:
            trigram += trigramDiff[key]

    return bigram + trigram
if __name__ == '__main__':
    # Print every French reference name, each followed on the next line by
    # its language score.
    for word in df_french['Name']:
        score = whichLanguage(word)
        print(word)
        print(score)