-
Notifications
You must be signed in to change notification settings - Fork 0
/
Country.py
146 lines (109 loc) · 5.3 KB
/
Country.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import pandas as pd
import numpy as np
# import nltk
# nltk.download('words') if needed
# reference to here https://www.nltk.org/data.html
from nltk.corpus import words as english
# --- Reference data, loaded once at import time ---

# Known country names, lower-cased, one per line.
# Use context managers so the file handles are closed deterministically
# (the original left them open until GC).
with open('Country_list.txt') as _f:
    COUNTRY_LIST = [line.rstrip('\n').lower() for line in _f]

# US state names, lower-cased, one per line.
with open('US_states_list.txt') as _f:
    US_STATES_LIST = [line.rstrip('\n').lower() for line in _f]

# World-cities database; 'city_ascii' and 'country' columns are used below.
WORLD_CITIES = pd.read_excel("worldcities.xlsx")
CITY_LIST = [x.lower() for x in WORLD_CITIES['city_ascii'].tolist()]
# city name -> country lookup, lower-cased. NOTE(review): duplicate city
# names keep multiple rows, so .loc on such a name returns a DataFrame.
# (The original also .reindex()'d the concat onto its own index — a no-op,
# dropped here.)
CITY_TO_COUNTRY = pd.concat(
    [WORLD_CITIES['city_ascii'].str.lower(), WORLD_CITIES['country'].str.lower()],
    axis=1,
).set_index('city_ascii')

# Nationality adjective -> country lookup (e.g. "chinese" -> "china").
NATIONALITIES = pd.read_excel("nationality_to_nation.xlsx")
NATIONALITY_LIST = [x.lower() for x in NATIONALITIES['Adjectivals'].tolist()]
NATIONALITY_TO_COUNTRY = pd.concat(
    [NATIONALITIES['Adjectivals'].str.lower(), NATIONALITIES['Country/entity name'].str.lower()],
    axis=1,
).set_index('Adjectivals')
# VOCAB = set(w.lower() for w in english.words())
def find_names_for_google(df_birth_names):
    """Resolve raw birthplace strings to country names.

    Each value of ``df_birth_names['birth']`` is stripped of everything but
    ASCII letters and spaces, then matched — first exactly, then by
    substring — against, in priority order:

      1. the known-country list (some entities are debatable: multi-named
         countries, the two Congos / Koreas, Hong Kong / Taiwan, etc. are
         treated as countries for now),
      2. US state names (mapped to "United States"),
      3. nationality adjectives (e.g. "Chinese" -> "China"),
      4. the worldcities database (city -> country).

    NOTE(review): the lookup lists are lower-cased at load time while the
    cleaned item keeps its original case, so matching only fires on
    lower-case input — confirm this is intended.

    :param df_birth_names: DataFrame with a 'birth' column of raw strings
        (the birth data provided by Lu).
    :return: tuple ``(df_country_found, df_need_google_search)`` — one
        resolved country (NaN for empty input) per processed row, and the
        de-duplicated strings that could not be resolved and still need a
        manual / Google search.
    """
    # Keep ASCII letters and spaces only; drops digits and punctuation.
    whitelist = set('abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    dirty_list = []           # resolved country (or NaN) per row, in order
    need_searching_list = []  # raw items that could not be resolved
    # TODO(review): .head(30) looks like a leftover debugging limit —
    # confirm whether the full frame should be processed.
    for index, row in df_birth_names.head(30).iterrows():
        item = ''.join(filter(whitelist.__contains__, row['birth'])).strip()
        # BUGFIX: original used `item is ""` (identity comparison with a
        # literal — a SyntaxWarning and unreliable); use truthiness instead.
        if not item:  # nothing left after cleaning — record as missing
            dirty_list.append(np.nan)
            print(item, " is null")
            continue
        if item in COUNTRY_LIST:  # exact match: known country
            dirty_list.append(item)
            print(item, " is a country")
            continue
        if item in US_STATES_LIST:  # exact match: US state
            dirty_list.append("United States")
            print(item, " is a state in the US")
            continue
        if item in NATIONALITY_LIST:  # nationality adjective, e.g. Chinese -> China
            nation_from_nationality = NATIONALITY_TO_COUNTRY.loc[item]["Country/entity name"]
            dirty_list.append(nation_from_nationality)
            print(item, " is a national of a certain country")
            continue
        if item in CITY_LIST:  # known city, e.g. London -> UK
            # NOTE(review): for duplicate city names .loc returns several
            # rows; "take the most populous city" (e.g. Valencia) is the
            # intent per the docstring but is not implemented yet.
            country_from_city = CITY_TO_COUNTRY.loc[item]["country"]
            dirty_list.append(country_from_city)
            print(item, " is a city and it has been transformed")
            continue
        # Fallback: substring matches, same priority order as above.
        matched = False
        for country in COUNTRY_LIST:
            if country in item:
                dirty_list.append(country)
                print(country, " maybe a country")
                matched = True
                break
        if matched:
            continue
        for state in US_STATES_LIST:
            if state in item:
                dirty_list.append("United States")
                print(state, "maybe a state in the US")
                matched = True
                break
        if matched:
            continue
        for city in CITY_LIST:
            if city in item:
                country_from_city = CITY_TO_COUNTRY.loc[city]["country"]
                dirty_list.append(country_from_city)
                print(city, " maybe a city, and we are attempting to transform it")
                matched = True
                break
        if matched:
            continue
        need_searching_list.append(item)
        print("this item: ", item, " is not added")
    # De-duplicate while preserving first-seen order.
    need_searching_list = list(dict.fromkeys(need_searching_list))
    df_country_found = pd.DataFrame(dirty_list)
    df_need_google_search = pd.DataFrame(need_searching_list)
    return df_country_found, df_need_google_search
def read(fpath):
    """Load a Stata (.dta) file and return it as a pandas DataFrame.

    :param fpath: path to the .dta file on disk.
    :rtype: a dataframe read
    """
    return pd.read_stata(fpath)
if __name__ == "__main__":
    # Script entry point: load the Stata birth-data file and split it into
    # rows resolved to a country vs. strings that need a manual search.
    fpath = "list.dta"
    df = read(fpath)
    df_country_found, df_need_google_search = find_names_for_google(df)
    # Report sizes of both result frames (print labels are Chinese for
    # "length of ...").
    print('df_need_google_search的长度 ', len(df_need_google_search))
    print('df_country_found 长度: ', len(df_country_found))
    # df_temp = clean_names(read(fpath))
    # print(df_temp)