-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrule_based_match.py
137 lines (112 loc) · 5.89 KB
/
rule_based_match.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import pandas as pd
from Utils import Config
# Relative path of the coding-scheme folder.
# NOTE(review): nothing in the visible code reads this constant; the CSV paths
# below are built from Config.SCHEMES_DIR instead - confirm whether it is still needed.
DATA_DIR = 'Data/Coding_Schemes/'
# CAP codes whose MAN correspondence is hand-picked individually.
# Built once at import time instead of once per processed row.
_INDIVIDUAL_MAN_CODES = {
    '105': '414',   # National Budget -> Economic Orthodoxy
    '599': '701',   # Labor - Other -> Labour Groups
    '900': '607',   # Immigration -> Multiculturalism
    '1000': '411',  # Transportation - General -> Technology and Infrastructure
    '1523': '504',  # Disaster Relief -> Welfare State
    '1701': '411',  # Space -> Technology and Infrastructure
    '2010': '304',  # Scandals -> Political Corruption
    '2102': '607',  # Indigenous Affairs -> Multiculturalism
}


def fill_individual(x):
    """
    Assign the hand-picked MAN code to a row whose CAP code is listed in the individual overrides.

    The override is applied whenever ``x.Code`` is listed, whatever ``x.MAN`` currently holds. For every other
    row the existing MAN value is normalised to a plain string without decimals (``'410.0'`` -> ``'410'``); a
    missing value (float NaN) therefore becomes the string ``'nan'``.

    The row is modified in place and also returned.

    :param x: row of dataframe, with ``Code`` (CAP code) and ``MAN`` entries
    :return: updated row
    """
    # Keep only the text before the first '.', so all codes share the same format (XXX)
    normalised = str(x.MAN).split('.')[0]
    x.MAN = _INDIVIDUAL_MAN_CODES.get(x.Code, normalised)
    return x
# Two-character CAP code prefixes whose still-unmapped codes all share one MAN code.
# Built once at import time instead of once per processed row.
_GROUP_MAN_CODES = {
    '80': '411',  # Energy (6) -> Technology and Infrastructure
    '14': '411',  # Housing (7) -> Technology and Infrastructure
    '16': '104',  # Defense (2) -> Military
    '21': '501',  # Public Lands (2) -> Environmental Protection
}


def fill_groups(x):
    """
    Assign a MAN code to a still-unmapped row based on the group its CAP code belongs to.

    Some CAP domains can be grouped up as the same. Only rows whose MAN value is the string ``'nan'`` are
    touched, so a code that needs an exception can be given its own MAN code beforehand and will be left alone
    here.

    The row is modified in place and also returned.

    :param x: row of dataframe, with ``Code`` (CAP code, string) and ``MAN`` entries
    :return: updated row
    """
    # Rows that already carry a MAN code are returned untouched
    if x.MAN != 'nan':
        return x

    # The first 2 characters of the CAP code are treated as the group.
    # CAUTION: 3-digit and 4-digit codes can share a prefix (ex: '199' and '1901' both give '19'),
    # so any new key added to the group table must be checked against both lengths.
    cap_group = x.Code[:2]
    x.MAN = _GROUP_MAN_CODES.get(cap_group, x.MAN)
    return x
def spanish_media_codes():
    """
    Build the CAP -> MAN correspondence for the codes that only exist in the Spanish Media codebook.

    Spanish Media files seem to have been coded with their own codebook, heavily based on the CAP master
    codebook. For the most part the correspondences between the CAP master codebook and the MAN codebook still
    apply, and some CAP codes are simply absent. Eight codes were added on top of the CAP topics: three inside an
    existing CAP Major Topic (Culture) and five inside new Major Topics (Climate (1), Sports (1) and
    Death Notices (3)).

    The result is shaped like the main correspondence table so that it can be appended to it.

    :return: DataFrame with columns ``Code`` and ``MAN``, one row per Spanish-Media-only code
    """
    code_pairs = [
        ('2301', '502'),  # Culture: Cinema, Theatre, etc. -> Culture
        ('2302', '502'),  # Culture: Books -> Culture
        ('2399', '502'),  # Culture: Others -> Culture
        ('2700', '501'),  # Climate: General -> Environmental Protection
        ('2900', '502'),  # Sports: General -> Culture
        ('3001', '000'),  # Death Notices: Natural Death -> General
        ('3002', '000'),  # Death Notices: Violent Death -> General
        ('3099', '000'),  # Death Notices: Others -> General
    ]
    frame = pd.DataFrame(code_pairs, columns=['Code', 'MAN'])
    return frame
# MAN codes that are folded into another MAN code (key is replaced by value).
# Built once at import time instead of once per processed value.
_REDUCED_MAN_CODES = {
    '102': '101',
    '105': '104',
    '109': '107',
    '110': '108',
    '204': '203',
    '407': '406',
    '505': '504',
    '507': '506',
    '602': '601',
    '604': '603',
    '608': '607',
    '702': '701',
}


def reduce(man_code):
    """
    Collapse a pair of MAN codes that differ only in sentiment into a single code.

    In MAN codes where there is sentiment, we only keep one; the kept code then acts as a general statement for
    both. Ex: Education Limitation (507) vs Education Expansion (506) => Education (506).
    Codes that are not part of such a pair are returned unchanged.

    :param man_code: MAN code (string)
    :return: MAN code after reduction
    """
    return _REDUCED_MAN_CODES.get(man_code, man_code)
# Build the CAP -> MAN correspondence table and write it back to disk.
# NOTE(review): the table is read from and written to the same file, so running this script a second time
# appends the Spanish Media rows again - confirm whether that is intended.
df = pd.read_csv(Config.SCHEMES_DIR + 'cap_to_man.csv', dtype='object')
df = df.apply(fill_individual, axis=1)  # Individuals first
df = df.apply(fill_groups, axis=1)  # Groups second
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df = pd.concat([df, spanish_media_codes()], ignore_index=True)  # Spanish Media additional codes
df.MAN = df.MAN.apply(reduce)
df.to_csv(Config.SCHEMES_DIR + 'cap_to_man.csv', index=False)

# Codebooks that verbose() uses to look up the topic names of each code
cap = pd.read_csv(Config.SCHEMES_DIR + 'CAP.csv', dtype='object')
man = pd.read_csv(Config.SCHEMES_DIR + 'MAN_v4.csv', dtype='object')
# Names of the codes that only exist in the Spanish Media codebook.
# Built once at import time instead of once per processed row.
_SPANISH_CODE_NAMES = {
    '2301': 'Culture: Cinema, theatre, music and dance',
    '2302': 'Culture: Publication of books and literary works',
    '2399': 'Culture: Others',
    '2700': 'Climate: General',
    '2900': 'Sports: General',
    '3001': 'Death Notices: Natural Death',
    '3002': 'Death Notices: Violent Death',
    '3099': 'Death Notices: Others',
}


def verbose(row):
    """
    Replace the CAP and MAN codes of a row with their explicit names.

    For visual confirmation of the correspondence, a CSV file with each code named explicitly is also produced;
    this function builds one row of it. CAP codes become ``"<Major Topic>: <Minor Topic>"`` (looked up in the
    module-level ``cap`` table, or taken from the Spanish Media names), MAN codes become their ``Minor Topic``
    (looked up in the module-level ``man`` table).

    The row is modified in place and also returned.

    :param row: row of dataframe, with ``Code`` (CAP code) and ``MAN`` (MAN code) entries
    :return: new row, holding names instead of codes
    :raises IndexError: if the CAP code or the MAN code is not present in its codebook
    """
    cap_code = row.Code
    if cap_code in _SPANISH_CODE_NAMES:
        row.Code = _SPANISH_CODE_NAMES[cap_code]
    else:
        cap_matches = cap[cap.Code == cap_code][["Major Topic", "Minor Topic"]]
        if cap_matches.empty:
            # Same exception type as the bare .iloc[0] would raise, but naming the offending code
            raise IndexError('CAP code {!r} not found in the CAP codebook'.format(cap_code))
        row.Code = ': '.join(cap_matches.iloc[0].tolist())

    # MAN codes are smaller in range and more explicit
    man_code = row.MAN
    man_names = man[man.Code == man_code]["Minor Topic"].tolist()
    if not man_names:
        # Same exception type as the bare [0] would raise, but naming the offending code
        raise IndexError('MAN code {!r} not found in the MAN codebook'.format(man_code))
    row.MAN = man_names[0]
    return row
# Human-readable copy of the correspondence table: every code replaced by its name
verbose_df = df.apply(verbose, axis=1)
verbose_df.to_csv(
    Config.SCHEMES_DIR + 'cap_to_man_verbose.csv',
    index=False,
)