-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathexpected_patent_influences.py
462 lines (331 loc) · 16.4 KB
/
expected_patent_influences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
from sklearn.linear_model import LinearRegression
import scipy.stats as stats
import ast
import math
import datetime
from dateutil.relativedelta import relativedelta
import random
def classification_to_list(classification):
    """ Turns a string of classifications into a list

    Args:
        classification (str): USPTO classifications serialized as a list
            literal, e.g. "['A61K/31', 'C07D/2']"
    Returns:
        list: the USPTO classifications applied to a particular patent. The
            evaluated literal is returned when the string parses; an empty
            list when the value is not a literal at all (e.g. a NaN cell);
            and a cleaned comma split when the string is not valid Python
            syntax (e.g. an unquoted list).
    """
    try:
        return ast.literal_eval(classification)
    except ValueError:
        # Not a literal (for instance a missing/NaN cell).
        return []
    except SyntaxError:
        # Unquoted or otherwise malformed list: fall back to a comma split.
        # Each piece is trimmed so entries after the first do not keep the
        # space that followed the comma, nor any stray quote characters.
        inner = classification.strip().strip("][")
        return [piece.strip().strip("'\"") for piece in inner.split(",")]
def filter_classification(classification):
    """ Drops the subheading from a single USPTO classification

    Args:
        classification (str): one USPTO classification, in the form
            "classification/subheading"
    Returns:
        str: the text before the first forward slash, or "" when the value
            has no ``split`` method (e.g. a NaN from an empty cell)
    """
    try:
        head, *_subheadings = classification.split("/")
    except AttributeError:
        # Non-string input (such as NaN) carries no classification text.
        return ""
    return head
def test_results_df(results_df, expected_rows=1247658):
    """ Checks that the results dataframe expanded as expected

    Args:
        results_df (pandas dataframe): holds patent data regarding authors,
            assignees, and classifications
        expected_rows (int): number of rows the expanded dataframe must have.
            Defaults to 1247658, the row count that was previously hard-coded.
    Raises:
        AssertionError: if ``len(results_df)`` differs from ``expected_rows``
    """
    # Raised explicitly rather than through ``assert`` so the check is not
    # stripped when Python runs with -O.
    if len(results_df) != expected_rows:
        raise AssertionError("Classification Filter failed")
    print("Classification results expanded", end="\n\n")


# The name matches pytest's default ``test_*`` pattern. Flag it as a helper so
# it is not collected as a test (and then fails on the missing ``results_df``
# fixture) when this module is imported from a test file.
test_results_df.__test__ = False
def get_results_df(fp):
    """ Reads the results CSV and expands it to one row per classification

    Args:
        fp (str): filepath to authors/assignee/classification results
    Returns:
        pandas dataframe: the CSV contents with the "classification" column
            parsed and exploded, plus a "filtered_classification" column
            holding each classification without its subheading
    """
    # Registers ``progress_apply`` on pandas objects.
    tqdm.pandas()
    raw_df = pd.read_csv(fp)

    print("Expanding classification results...")
    raw_df["classification"] = raw_df["classification"].progress_apply(
        classification_to_list)

    # One row per (original row, classification) pair.
    expanded_df = raw_df.explode("classification")
    without_subheading = expanded_df["classification"].apply(
        filter_classification)
    expanded_df["filtered_classification"] = without_subheading

    # Sanity check on the expanded row count before handing the data back.
    test_results_df(expanded_df)
    return expanded_df
def patent_stats_per_assignee(results_df):
    """ Get patent-count and active-time statistics per assignee

    Only assignees with more than 10 rows in results_df contribute.

    Args:
        results_df (pandas dataframe): full expanded results dataframe; the
            "assignees" and "date" columns are read, with dates given as
            "YYYY-MM-DD" strings
    Returns:
        float: average patents per assignee
        float: standard deviation of patents per assignee distribution
        float: average number of months between an assignee's first and
            last date
        float: standard deviation of that month distribution
    """
    patents_per_assignee = []
    assignee_lengths = []
    time_format = "%Y-%m-%d"

    print("Calculating patents per assignee statistics & time per patents...")
    # One groupby pass instead of re-filtering the whole dataframe for every
    # assignee. sort=False keeps assignees in order of first appearance, and
    # rows with a missing assignee are left out (they never matched the old
    # equality filter either).
    # NOTE(review): if results_df comes from get_results_df it holds one row
    # per classification, so these counts are rows rather than distinct
    # patents - confirm that is intended.
    dates_by_assignee = results_df.groupby("assignees", sort=False)["date"]
    for _, dates in tqdm(dates_by_assignee):
        if len(dates) <= 10:
            continue
        patents_per_assignee.append(len(dates))
        start_date = datetime.datetime.strptime(min(dates), time_format)
        end_date = datetime.datetime.strptime(max(dates), time_format)
        # Whole calendar months between the first and last date.
        assignee_lengths.append((end_date.year - start_date.year) * 12 +
                                end_date.month - start_date.month)

    avg_patents_per_assignee = np.mean(patents_per_assignee)
    stdev_patents_per_assignee = np.std(patents_per_assignee)
    print(
        f"Average patents per assignee: {round(avg_patents_per_assignee,3)}"
        f", stdev patents per assignee: {round(stdev_patents_per_assignee,3)}\n"
    )

    avg_months_per_assignee = np.mean(assignee_lengths)
    stdev_months_per_assignee = np.std(assignee_lengths)
    print(f"Average time per assignee: {round(avg_months_per_assignee,3)}"
          f", stdev time per assignee: {round(stdev_months_per_assignee,3)}\n")

    return (avg_patents_per_assignee, stdev_patents_per_assignee,
            avg_months_per_assignee, stdev_months_per_assignee)
def time_per_assignee(results_df):
    """ Calculating the average time an assignee exists for

    Only assignees with more than 10 rows in results_df contribute.

    Args:
        results_df (pandas dataframe): full expanded results dataframe; the
            "assignees" and "date" columns are read, with dates given as
            "YYYY-MM-DD" strings
    Returns:
        float: average time (in months) an assignee sticks around
        float: standard deviation of time distribution
    """
    assignee_lengths = []
    # Named time_format so the builtin ``format`` is not shadowed.
    time_format = "%Y-%m-%d"

    print("Calculating average time per assignee...")
    # One groupby pass instead of re-filtering the whole dataframe for every
    # assignee. sort=False keeps assignees in order of first appearance, and
    # rows with a missing assignee are left out (they never matched the old
    # equality filter either).
    dates_by_assignee = results_df.groupby("assignees", sort=False)["date"]
    for _, dates in tqdm(dates_by_assignee):
        if len(dates) <= 10:
            continue
        start_date = datetime.datetime.strptime(min(dates), time_format)
        end_date = datetime.datetime.strptime(max(dates), time_format)
        # Whole calendar months between the first and last date.
        assignee_lengths.append((end_date.year - start_date.year) * 12 +
                                end_date.month - start_date.month)

    avg_months_per_assignee = np.mean(assignee_lengths)
    std_months_per_assignee = np.std(assignee_lengths)
    print(f"Average time per assignee: {round(avg_months_per_assignee,3)}"
          f", stdev time per assignee: {round(std_months_per_assignee,3)}\n")

    return avg_months_per_assignee, std_months_per_assignee
def build_profiles(avg_patents_per_assignee, stdev_patents_per_assignee,
                   num_profiles):
    """ Draw a random patent count for each synthetic assignee profile

    Args:
        avg_patents_per_assignee (float): average patents per assignee
        stdev_patents_per_assignee (float): standard deviation of patents per
            assignee distribution
        num_profiles (int): number of profiles to build
    Returns:
        list: num_profiles integers, each the number of patents in one
            profile, sampled from a normal distribution truncated to
            [1, 10000] and rounded down
    """
    print("Generating random profiles...")

    min_size = 1
    max_size = 10000  # upper value set to be arbitrarily high
    # The truncation bounds are passed as offsets from the mean measured in
    # standard deviations.
    lower_z = (min_size - avg_patents_per_assignee) / stdev_patents_per_assignee
    upper_z = (max_size - avg_patents_per_assignee) / stdev_patents_per_assignee
    samples = stats.truncnorm.rvs(lower_z,
                                  upper_z,
                                  loc=avg_patents_per_assignee,
                                  scale=stdev_patents_per_assignee,
                                  size=num_profiles)

    # math.floor already yields plain Python ints.
    profile_patent_size = list(map(math.floor, samples))

    print("Random profile length:", len(profile_patent_size), ";",
          profile_patent_size[0:10], "\n")
    return profile_patent_size
def random_dates2(start, end, n, unit='D', seed=None):
    """
    Draws n random dates between start and end (both inclusive)
    (from https://stackoverflow.com/questions/50559078/generating-random-dates-within-a-given-range-in-pandas)

    Args:
        start (datetime): earliest date that can be drawn
        end (datetime): latest date that can be drawn
        n (int): number of dates to draw
        unit (str): unit given to pd.to_timedelta for the random offsets.
            NOTE(review): the offset range is always counted in days, so
            any unit other than 'D' changes the span that is sampled.
        seed (int, optional): when an integer is given, the draw is
            reproducible. Any other value, including None and the
            ``random.randint`` function some callers in this file pass, is
            ignored and the global NumPy random state is used.
    Returns:
        start plus n random offsets, as built by pandas. Callers in this
        file convert the result with ``.to_pydatetime()``.
    Raises:
        ValueError: if end is before start, or if an integer seed is
            outside the range NumPy accepts
    """
    ndays = (end - start).days + 1
    if isinstance(seed, (int, np.integer)) and not isinstance(seed, bool):
        # A dedicated generator honours the seed without disturbing the
        # global NumPy random state used everywhere else.
        draw = np.random.RandomState(seed).randint
    else:
        draw = np.random.randint
    return start + pd.to_timedelta(draw(0, ndays, n), unit=unit)
def get_starting_time(months):
    """ Generate a random starting time for a given "profile" - this is the date of the first patent

    The date is drawn between 1 Jan 1976 and (31 Dec 2022 - number of months)

    Args:
        months (int): number of months that a particular profile exists for
    Returns:
        datetime.datetime: one randomly drawn starting date
    """
    low_range = datetime.datetime(year=1976, month=1, day=1)
    # Latest start that still leaves room for `months` before the end of 2022.
    # NOTE(review): day=0 is kept from the original call; relativedelta
    # appears to treat it as "no absolute day" - confirm.
    high_range = datetime.datetime(year=2022, month=12, day=31) - relativedelta(
        months=months, day=0)

    # Draw a single date between the low & high range and unwrap it. No seed
    # is passed: the value used before (the random.randint function object)
    # was not a usable seed.
    start_date = random_dates2(low_range, high_range, 1).to_pydatetime()[0]
    return start_date
def get_timestamps(avg_months_per_assignee, stdev_months_per_assignee,
                   num_profiles):
    """ Generate a random (start, end) time window for each profile

    Args:
        avg_months_per_assignee (float): average number of months an assignee
            is active
        stdev_months_per_assignee (float): standard deviation of that
            distribution
        num_profiles (int): number of profiles to generate windows for
    Returns:
        list: num_profiles tuples (t_start, t_end), the starting and ending
            timestamp of each "company" profile
    """
    print("Generating a set of timestamps for each profile...")

    # Lifetime of each profile in months, sampled from a normal distribution
    # truncated to [1, 556] and rounded down.
    min_months = 1
    max_months = 556
    sampled_months = stats.truncnorm.rvs(
        (min_months - avg_months_per_assignee) / stdev_months_per_assignee,
        (max_months - avg_months_per_assignee) / stdev_months_per_assignee,
        loc=avg_months_per_assignee,
        scale=stdev_months_per_assignee,
        size=num_profiles)
    num_months = [math.floor(value) for value in sampled_months]

    profile_timestamps = []
    for months in tqdm(num_months):
        # Random first date, then the end date `months` later.
        window_start = get_starting_time(months)
        window_end = window_start + relativedelta(months=months, day=0)
        profile_timestamps.append((window_start, window_end))

    print("Random profile timestamps:", len(profile_timestamps), ";",
          profile_timestamps[0:10], "\n")
    return profile_timestamps
def get_all_profile_patent_timestamps(random_profile_sizes,
                                      random_profile_timestamps):
    """ Generate the timestamps of every patent in every profile

    Args:
        random_profile_sizes (list): number of patents in each profile
        random_profile_timestamps (list): (start, end) window of each
            profile, in the same order as random_profile_sizes
    Returns:
        list: one entry per profile, each holding that profile's patent
            timestamps (as many as the profile's size)
    """
    print("Building timestamps for all patents...")
    profile_patent_timestamps = []

    # Walk the patent counts and look up the matching time window by position.
    for idx, n_patents in enumerate(tqdm(random_profile_sizes)):
        window = random_profile_timestamps[idx]
        # Patent dates are sampled uniformly inside the "company" window.
        sampled = random_dates2(window[0],
                                window[1],
                                n_patents,
                                seed=random.randint)
        profile_patent_timestamps.append(sampled.to_pydatetime())

    print(f"All {len(profile_patent_timestamps)} patent timestamps built.")
    print(f"0th profile length: {len(profile_patent_timestamps[0])}, {profile_patent_timestamps[0][0:10]}\n")
    return profile_patent_timestamps
def get_avg_MA_per_timestamped_patent(timestamp, MA_month_avg_dict,
                                      MA_month_std_dict, n_cpds):
    """ Samples MAs for one patent and returns their mean

    Args:
        timestamp (datetime.datetime object): specific date of a random
            "patent"
        MA_month_avg_dict (dict): average MA keyed by "YYYY-MM" month string
        MA_month_std_dict (dict): MA standard deviation keyed by "YYYY-MM"
            month string
        n_cpds (int): number of compounds in a given patent
    Returns:
        float: mean of n_cpds MA values sampled from a normal distribution
            with that month's average and standard deviation, truncated to
            [1, 1000]
    Raises:
        KeyError: if the timestamp's month is missing from either dict
    """
    # Zero-padded "YYYY-MM" key of the timestamp's month.
    month_key = f"{timestamp.year}-{timestamp.month:02d}"
    month_avg = MA_month_avg_dict[month_key]
    month_std = MA_month_std_dict[month_key]

    min_MA = 1
    max_MA = 1000  # upper value set to be arbitrarily high
    sampled_MAs = stats.truncnorm.rvs((min_MA - month_avg) / month_std,
                                      (max_MA - month_avg) / month_std,
                                      loc=month_avg,
                                      scale=month_std,
                                      size=n_cpds)
    return np.mean(sampled_MAs)
def get_MAs_of_patents(profile_patent_timestamps, avg_cpds_per_patent,
                       stdev_cpds_per_patent, MA_month_avg_dict,
                       MA_month_std_dict):
    """ Samples a compound count and an average MA for every patent

    Args:
        profile_patent_timestamps (list): one sequence of patent timestamps
            per profile
        avg_cpds_per_patent (float): average number of compounds per patent
        stdev_cpds_per_patent (float): standard deviation of compounds per
            patent
        MA_month_avg_dict (dict): average MA keyed by month
        MA_month_std_dict (dict): MA standard deviation keyed by month
    Returns:
        list: per profile, the average MA of each of its patents
        list: per profile, the sampled number of compounds of each patent
    """

    def sample_n_cpds():
        # Compound count for one patent: normal distribution truncated to
        # [1, 10000], rounded down.
        min_cpds = 1
        max_cpds = 10000  # upper value set to be arbitrarily high
        drawn = stats.truncnorm.rvs(
            (min_cpds - avg_cpds_per_patent) / stdev_cpds_per_patent,
            (max_cpds - avg_cpds_per_patent) / stdev_cpds_per_patent,
            loc=avg_cpds_per_patent,
            scale=stdev_cpds_per_patent,
            size=1)[0]
        return math.floor(drawn)

    profile_avg_MAs = []
    profile_cpds_per_patent = []

    print("Finding MA values per profile...")
    for profile in tqdm(profile_patent_timestamps):
        patent_MAs = []
        patent_cpd_counts = []
        for timestamp in profile:
            n_cpds = sample_n_cpds()
            patent_cpd_counts.append(n_cpds)
            # Average MA over the compounds sampled for this patent's month.
            patent_MAs.append(
                get_avg_MA_per_timestamped_patent(timestamp,
                                                  MA_month_avg_dict,
                                                  MA_month_std_dict, n_cpds))
        profile_avg_MAs.append(patent_MAs)
        profile_cpds_per_patent.append(patent_cpd_counts)

    print(f"Found {len(profile_avg_MAs)} MA values: {profile_avg_MAs[0][0:10]}\n")
    return profile_avg_MAs, profile_cpds_per_patent
def get_deltaMA(profile_avg_MAs, profile_patent_timestamps):
    """ Get the deltaMA of each profile from a linear fit of MA against date

    Args:
        profile_avg_MAs (list): per profile, the average MA of each patent
        profile_patent_timestamps (list): per profile, the timestamp of each
            patent, in the same order as profile_avg_MAs
    Returns:
        list: one value per profile - the fitted slope (MA per day) multiplied
            by the number of days between the profile's first and last patent
    """
    print("Fnding deltaMA slopes...")
    linear_regressor = LinearRegression()
    profile_MA_slopes = []

    for idx, avg_MAs in enumerate(tqdm(profile_avg_MAs)):
        # Dates become ordinal day numbers so they can be regressed on.
        day_numbers = [stamp.toordinal()
                       for stamp in profile_patent_timestamps[idx]]
        X = np.array(day_numbers).reshape(-1, 1)
        Y = np.array(avg_MAs).reshape(-1, 1)

        fitted = linear_regressor.fit(X, Y)
        slope_per_day = fitted.coef_[0][0]
        # Slope times the covered time span gives the change over the
        # profile's lifetime.
        span_in_days = (max(X) - min(X))[0]
        profile_MA_slopes.append(slope_per_day * span_in_days)

    print(
        f"Found {len(profile_MA_slopes)} deltaMA values; {profile_MA_slopes[0:10]}\n"
    )
    return profile_MA_slopes
def get_positive_slopes(profile_MA_slopes):
    """ Find the share of positive slopes in a particular sample

    Args:
        profile_MA_slopes (list): list of deltaMA slopes
    Returns:
        float: fraction (between 0 and 1) of slopes that are strictly greater
            than zero; 0.0 when the list is empty
    """
    total = len(profile_MA_slopes)
    if total == 0:
        # Nothing to count: report 0.0 instead of dividing by zero.
        positive_slope_percentage = 0.0
    else:
        positive_count = sum(1 for MA_slope in profile_MA_slopes
                             if MA_slope > 0)
        positive_slope_percentage = float(positive_count) / total

    print(f"Percent positive slopes: {positive_slope_percentage}")
    return positive_slope_percentage
def main():
    """ Runs the full simulation: loads the data, builds random assignee
    profiles, samples their patents and MAs, and prints the share of profiles
    whose deltaMA is positive
    """
    section_break = "\n" + " - " * 20 + "\n"

    # Load results dataframe and MA data
    results_df = get_results_df("Data/Patents/patent_MA_results.csv")
    MA_df = pd.read_csv("Data/AssemblyValues/MA_df_months_FULL.csv")
    MA_month_avg_dict = dict(zip(MA_df["month"], MA_df["avg"]))
    MA_month_std_dict = dict(zip(MA_df["month"], MA_df["stdev"]))

    # Precalculated (see deltaMA_factors.ipynb)
    avg_cpds_per_patent = 108.41867561794324
    stdev_cpds_per_patent = 272.51858323850996

    assignee_stats = patent_stats_per_assignee(results_df)
    (avg_patents_per_assignee, stdev_patents_per_assignee,
     avg_months_per_assignee, stdev_months_per_assignee) = assignee_stats
    print(f"Avg Patents Per Assignee: {avg_patents_per_assignee}\n"
          f"Stdev Patents Per Assignee: {stdev_patents_per_assignee}\n"
          f"Avg Months Per Assignee: {avg_months_per_assignee}\n"
          f"Stdev Months Per Assignee: {stdev_months_per_assignee}")
    print(section_break)

    # Number of random profiles to simulate.
    num_profiles = 769

    random_profile_sizes = build_profiles(avg_patents_per_assignee,
                                          stdev_patents_per_assignee,
                                          num_profiles)
    print(f"Random profile sizes: {random_profile_sizes[0:10]}")
    print(section_break)

    random_profile_timestamps = get_timestamps(avg_months_per_assignee,
                                               stdev_months_per_assignee,
                                               num_profiles)
    print(f"Random profile timestamps: {random_profile_timestamps[0:10]}")
    print(section_break)

    profile_patent_timestamps = get_all_profile_patent_timestamps(
        random_profile_sizes, random_profile_timestamps)

    # The per-patent compound counts are returned as well but not used here.
    profile_avg_MAs, _ = get_MAs_of_patents(profile_patent_timestamps,
                                            avg_cpds_per_patent,
                                            stdev_cpds_per_patent,
                                            MA_month_avg_dict,
                                            MA_month_std_dict)

    profile_MA_slopes = get_deltaMA(profile_avg_MAs, profile_patent_timestamps)

    # Prints the result; the returned value is not used further.
    get_positive_slopes(profile_MA_slopes)
# Run the simulation only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()