#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
####################################################################################
# This work is distributed under a BSD 3-Clause License.
# license terms see https://opensource.org/licenses/BSD-3-Clause
# developed mainly by Eva Bunge (ORCiD:0000-0002-5587-5934, github.com/ebunge)
# with support from Michaela Voigt (ORCiD:0000-0001-9486-3189, github.com/michaelavoigt)
# maintained by the Open Access team of TU Berlin University Library
####################################################################################
import numpy as np
import collections
import cPickle as pickle
import weakref
from prettytable import PrettyTable
import time
import graphics as gr
import cr
t = time.time()
# ----------------- 1. Enable/Disable Functionalities -------------------------
# Detailed instructions for querying databases and preparing DOAJ data
# are included in the manual (in German).
# These variables are used to run only certain parts of the script. Set
# them all to 'True' to run the whole script.
# doAnalysis: If True - do some statistics and analysis of the final data. If
# False - disable this feature.
# doReadIn: If True - read in the database data from the text files and save it
# to file 'finalList'. If False - data is loaded from file 'finalList'
doAnalysis = True
doReadIn = True
# Variable that determines whether to contact CrossRef. Possible values:
# 1: Contact the CrossRef API to cross-reference your article data
# with the licence information available at CrossRef. The CrossRef data is
# then saved in a file 'CRResults', so that it can be used at a later date
# without having to contact the API again.
# 2: Load the previously saved CrossRef data from 'CRResults'
# 0: Disable the feature completely.
contactCR = 1
# Decide what to do when a corresponding/first author of a publication can't be
# determined automatically. Possible values:
# 1: Write all articles to a file (docsToBeChecked.txt) for which it is
# impossible to determine the corresponding/first author automatically.
# Once you have determined which articles have a first/corresponding author
# from a relevant institution, save the data in a tab-delimited, utf-8
# encoded text file called 'docsChecked.txt', with the following format:
# each line corresponds to an article and the three columns contain (in this
# order): title, DOI, name of institution (which must be spelled identically
# to the name of the institution as set up in this script)
# 2: Load this information into the script and have it included in the final
# results and the statistic.
# 0: Disable this feature completely.
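# For illustration, a valid line in 'docsChecked.txt' could look like this
# (columns separated by real tab characters; title and DOI are hypothetical):
#     An example article title<TAB>10.1000/xyz123<TAB>TU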
checkToDo = 1
# ----------------- 2. Setting up Classes and Functions -----------------------
# Set up class for institutions
class inst(object):
instances = []
nameVar1 = None
def __init__(self, name, nameVariants):
self.__class__.instances.append(weakref.proxy(self))
self.name = name
self.nameVar = nameVariants
# Set up class for databases
class Database(object):
instancesdb = []
content = None
def __init__(self, name, idNummer):
self.__class__.instancesdb.append(weakref.proxy(self))
self.name = name
self.idNummer = idNummer
# Set up class for documents
class Document(object):
collaborators = None
nameVariant = None
doajSubject = None
publisher = None
lizenz = None
checks = ''
def __init__(self, authors, title, DOI, journal, ISSN, eISSN, year,
affiliations, corrAuth, eMail, subject, funding, dbID):
self.authors = authors
self.title = title
self.DOI = DOI
self.journal = journal
self.ISSN = ISSN
self.eISSN = eISSN
self.year = year
self.affiliations = affiliations
self.corrAuth = corrAuth
self.eMail = eMail
self.subject = subject
self.funding = funding
self.dbID = dbID
    # Build a matching key from the first three consonants of the author
    # names plus the first 19 consonants of the title (see 'vokale' below)
    def konsonanten(self):
d = ' '.join([''.join([item for item in self.authors if item in vokale])\
.lower()[0:3],
''.join([item for item in self.title if item in vokale])\
.lower()[0:19]])
return d
def arry(self):
return [self.authors, self.title, self.DOI, self.journal, self.ISSN,
self.eISSN, self.year, self.affiliations, self.corrAuth,
self.nameVariant, self.eMail, self.subject, self.doajSubject,
self.funding, self.publisher, self.lizenz,
self.dbID, self.checks, self.collaborators]
# Function that takes consonants from a title and turns them into a string
# INPUT: title of a publication (string)
# OUTPUT: first 19 consonants of the title (string)
def kons(title):
d = ''.join([item for item in title if item in vokale]).lower()[0:19]
return d
# Function that takes data in WoS-format and transforms the data into a list of
# Document-objects.
# INPUT: (list of records as described in section 4, database ID (integer))
# OUTPUT: list of Document-objects
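# For orientation, a shortened, hypothetical record in this tagged format
# might look like (continuation lines of a field start with whitespace):
# FN Thomson Reuters Web of Science
# AF Smith, John
#    Doe, Jane
# TI An example article title
# SO JOURNAL OF EXAMPLES
# SN 1234-5678
# PY 2016
# DI 10.1000/xyz123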
def wosFormat(wosRecords, ind):
    records = []
    i = 0
    kuerzel = ''  # most recently seen field tag, used for continuation lines
with open(wosRecords, 'rU') as f:
for line in f:
if i == 0:
newDoc = Document('', '', None, None, None,
None, None, '', None, None, None, None, ind)
i += 1
lengths = len(line)
if line[0:2] != ' ':
kuerzel = line[0:2]
if line[0:2] == 'TI':
newDoc.title = line[3:lengths].strip('\n').strip('\r')
elif line[0:2] == 'SO':
newDoc.journal = line[3:lengths].strip('\n').strip('\r')
elif line[0:2] == 'PY':
newDoc.year = line[3:lengths].strip('\n').strip('\r')
elif line[0:2] == 'SN':
if '-' not in line:
vorl = line[3:lengths].strip('ISSN ').strip('\n').strip('\r')
newDoc.ISSN = vorl[0:4] + '-' + vorl[4:8]
elif ',' in line:
newDoc.ISSN = line.strip('ISSN ').strip('\n').strip('\r')[0:9]
else:
newDoc.ISSN = line[3:lengths].strip('ISSN ').strip('\n').strip('\r')
elif line[0:2] == 'DI':
newDoc.DOI = line[3:lengths].strip('\n').strip('\r')
elif line[0:2] == 'AF':
newDoc.authors = line[3:lengths].strip('\n').strip('\r')
elif kuerzel == 'AF' and line[0:2] == ' ':
newDoc.authors += '; '
newDoc.authors += line[3:lengths].strip('\n').strip('\r')
            elif line[0:2] == 'FN':
                # 'FN' marks the start of a new record: keep the previous
                # Document only if it actually holds data, so that the
                # initial empty Document is never appended
                if newDoc.title != '' or newDoc.authors != '':
                    records.append(newDoc)
                newDoc = Document('', '', None, None, None,
                                  None, None, '', None, None, None, None, ind)
    records.append(newDoc)
return records
# Checks a given name of an institution against a list of approved name variants.
# INPUT: (text string with name of an institution, database ID)
# OUTPUT: (bool stating if institution is part of list, name of institution (string))
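# Example (with the institutions from section 3 and an illustrative
# affiliation string):
#   listCheck('Tech Univ Berlin, Inst Chem', dbWoS.idNummer) returns (True, 'TU')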
def listCheck(institution, ide):
val = [None] * len(institutions)
variant = None
for i in range(0, len(val)):
if ide in [dbWoS.idNummer, dbSF.idNummer]:
namensVar = institutions[i].nameVar
else:
namensVar = institutions[i].nameVar1
instList = [item for item in namensVar]
valu = [None] * len(instList)
for j in range(0, len(instList)):
valu[j] = all(word in institution for word in instList[j])
if any(valu):
variant = institutions[i].name
val[i] = True
break
return (any(val), variant)
# Function that takes data in PubMed-format and transforms it into list of Documents.
# INPUT: (list of records as described in section 4, database ID (integer))
# OUTPUT: list of Documents
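# For orientation, a shortened, hypothetical MEDLINE-format record might
# look like:
# PMID- 12345678
# IS  - 1234-5678 (Electronic)
# TI  - An example article title.
# FAU - Smith, John
# AD  - Technische Universitaet Berlin, Berlin, Germany.
# JT  - Journal of examples
# DP  - 2016 Mar
# LID - 10.1000/xyz123 [doi]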
def pubmedFormat(pmRecords, ind):
records = []
i = 0
authorCount = 0
newDoc = None
with open(pmRecords, 'rU') as f:
for line in f:
lengths = len(line)
if line[0:2] != ' ':
kuerzel = line[0:4]
if line[0:4] == 'PMID':
authorCount = 0
if i > 0:
records.append(newDoc)
newDoc = Document('', '', None, None, None,
None, None, '', None, None, None, None, ind)
i += 1
elif line[0:2] == 'TI':
newDoc.title = line[6:lengths].strip('\n').strip('\r')
elif kuerzel == 'TI ' and line[0:2] == ' ':
                newDoc.title += ' '
newDoc.title += line[6:lengths].strip('\n').strip('\r')
elif line[0:2] == 'IS' and line[-5:-2] == 'nic':
newDoc.eISSN = line[6:15]
elif line[0:2] == 'IS' and line[-5:-2] == 'ing':
newDoc.ISSN = line[6:15]
elif line[0:3] == 'FAU' and authorCount > 0:
newDoc.authors += '; '
newDoc.authors += line[6:lengths].strip('\n').strip('\r')
authorCount += 1
elif line[0:3] == 'FAU' and authorCount == 0:
newDoc.authors = line[6:lengths].strip('\n').strip('\r')
newDoc.corrAuth = newDoc.authors + '; '
authorCount += 1
elif authorCount == 1 and line[0:2] == 'AD':
newDoc.corrAuth += line[6:lengths].strip('\n').strip('\r')
newDoc.affiliations = line[6:lengths].strip('\n').strip('\r')
elif line[0:2] == ' ' and kuerzel == 'AD ' and authorCount == 1:
newDoc.affiliations += line[5:lengths].strip('\n').strip('\r')
newDoc.corrAuth += line[5:lengths].strip('\n').strip('\r')
elif line[0:2] == ' ' and kuerzel == 'AD ' and authorCount > 1:
newDoc.affiliations += line[5:lengths].strip('\n').strip('\r')
elif authorCount > 1 and line[0:2] == 'AD':
newDoc.affiliations += '; '
newDoc.affiliations += line[5:lengths].strip('\n').strip('\r')
elif line[0:2] == 'JT':
newDoc.journal = line[6:lengths].strip('\n').strip('\r')
elif line[0:2] == 'DP':
newDoc.year = line[6:10]
elif line[0:3] == 'LID' and 'doi' in line:
newDoc.DOI = line[6:lengths].strip('\n').strip('\r').strip(' [doi]')
records.append(newDoc)
return records
# List of consonants (note: the variable name 'vokale' is German for
# 'vowels', but the tuple in fact holds consonants)
vokale = ('b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r',
's', 't', 'v', 'w', 'x', 'y', 'z', 'B', 'C', 'D', 'F', 'G', 'H', 'J',
'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', 'Z')
# Duplicate check
# This function takes a set of data (= masterList) and compares incoming new
# data with it. All duplicates are removed from the new data which is then
# added to the masterList. This action is repeated for each database.
# Comparisons are done via DOI-matching and then title/author-matching.
# doiList = list that contains all DOIs from masterList.
# konsMast = a list of strings which consist of the first three consonants of
# the authors' names and the first 19 consonants of the title for the data
# in the masterList.
# INPUT: (iterating integer, list containing article data as described
# in section 4, list of DOIs (strings), list of strings for
# title/author-matching)
# OUTPUT: list containing Documents with duplicates removed
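# Worked example (illustrative): for a record with authors 'Smith, John' and
# title 'Open Access in Berlin', konsonanten() yields the key
# 'smt pnccssnbrln', i.e. the first three consonants of the author string
# plus the first 19 consonants of the title.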
def dubletten(iterates, masterList, dois, kons):
    if iterates == 1:
        nowList = contentSF
        doiList = [x.DOI for x in masterList]
        konsMast = [item.konsonanten() for item in masterList]
    else:
        nowList = datenbanken[iterates].content
        doiList = dois
        konsMast = kons
f = collections.Counter(konsMast)
i = len(nowList)
print datenbanken[iterates].name, ' - number of records: ', i
nowList = [item for item in nowList if item.DOI == None
or item.DOI.strip('"') == ''
or item.DOI.strip('"') not in doiList]
j = len(nowList)
print datenbanken[iterates].name, \
' - number of records removed via DOI-matching: ', \
i-j
    # records without author data cannot be keyed via konsonanten() and are
    # dropped here; they are counted under title/author-matching below
    nowList = [item for item in nowList if item.authors != None
               and item.konsonanten() not in f]
k = len(nowList)
print datenbanken[iterates].name, \
' - number of records removed via title/author-matching: ', \
j-k
print datenbanken[iterates].name, \
' - number of records added to masterList: ', \
k
if k > 0:
masterList += nowList
doiList += [x.DOI for x in nowList]
konsMast += [item.konsonanten() for item in nowList]
iterates += 1
if iterates < len(datenbanken):
masterList = dubletten(iterates, masterList, doiList, konsMast)
return masterList
# -------------------- 3. Set up Institutions ---------------------------------
# Set up institutions. Format for name variants:
# [[var1,var2],[var3]] is equivalent to: (var1 AND var2) OR (var3)
# Careful: the name variant used when querying the database is not
# necessarily the same name variant used in the raw data.
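# For example, TUnames below matches an affiliation such as
# 'Tech Univ Berlin' (contains 'Tech' AND 'Univ' AND 'Berlin') as well as
# one that contains both 'Berlin' and 'TU'.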
# TU
TUnames = [['Tech', 'Univ', 'Berlin'], ['Berlin', 'TU'],
['Berlin', 'Inst', 'Technol']]
TU = inst('TU', TUnames)
# Charité
Cnames = [['Charit', 'Univ'], ['Campus', 'Virchow', 'Berlin'],
['Campus', 'Franklin', 'Berlin'], ['Charit', 'Berlin']]
Charite = inst('Charité', Cnames)
# FU
FUnames = [['Berlin','FU'], ['Berlin','Free','Univ'], ['Berlin','Frei','Univ']]
FU = inst('FU', FUnames)
# HU
HUnames = [['Berlin', 'HU'], ['Berlin', 'Humboldt', 'Univ']]
HU = inst('HU', HUnames)
# UdK
UdKnames = [['Univ', 'Arts', 'Berlin'], ['Univ', 'Kunst', 'Berlin'],
['Berlin', 'UdK']]
UdK = inst('UdK', UdKnames)
# Beuth
Bnames = [['Beuth', 'Berlin']]
Beuth = inst('Beuth', Bnames)
# HTW
HTWnames = [['HTW', 'Berlin'], ['Tech', 'Wirt', 'Berlin']]
HTW = inst('HTW', HTWnames)
# HWR
HWRnames = [['HWR','Berlin'], ['Wirt','Recht','Berlin']]
HWR = inst('HWR', HWRnames)
# Alice Salomon
ASHnames = [['Alice','Salomon','Berlin'], ['ASH','Berlin']]
ASH = inst('ASH', ASHnames)
#Create list of institutions
institutions = [x for x in inst.instances]
# Set up differing name variants for PubMed data if needed
for item in institutions:
item.nameVar1 = item.nameVar
TU.nameVar1 = [['Technische Universitat Berlin'],
['Technische Universitaet Berlin'],
['Technische Universität Berlin'],
['Berlin Institute of Techn'],
['Tech Univ Berlin'],
['Berlin Univ Technol'],
['Univ Technol Berlin'],
['TU Berlin'],
['Tech. Univ. Berlin'],
['Berlin Inst Technol'],
['Technical University Berlin'],
['Technische Universitaet de Berlin'],
['Technical University of Berlin'],
['Berlin University of Technology']]
# ------------- 4. Read in Text Files and Extract Information -----------------
# Set up databases
dbWoS = Database('Web of Science', 1)
dbSF = Database('SciFinder', 2)
dbPM = Database('PubMed', 3)
dbTEMA = Database('TEMA', 4)
dbInspec = Database('Inspec', 5)
dbIEEE = Database('IEEE', 6)
dbPQ = Database('ProQuest', 7)
dbBSC = Database('Business Source Complete', 9)
dbGf = Database('GeoRef', 10)
dbCAB = Database('CAB Abstracts', 11)
dbCIN = Database('CINAHL', 12)
dbEB = Database('EBSCO', 13)
dbEm = Database('Embase', 14)
dbLisa = Database('LISA', 15)
dbScopus = Database('Scopus', 16)
dbSD = Database('SportDiscus', 17)
# List the databases
datenbanken = [x for x in Database.instancesdb]
# Read in database contents from text-files
if doReadIn == True:
# Read in the 'Web of Science' file and extract the relevant information.
contentWoS = []
with open('wos20xx.txt') as f:
ic = 0
for line in f:
fields = line.split('\t')
if ic > 0:
contentWoS.append(Document(fields[1], fields[8], fields[54],
fields[9], fields[38], fields[39],
fields[44], fields[22], fields[23],
fields[24], fields[58], fields[27],
dbWoS.idNummer))
else:
ic += 1
dbWoS.content = contentWoS
print 'Finished reading in Web of Science'
# Read in the 'SciFinder' files and extract the relevant information.
contentSF = []
with open('sf20xx.txt', 'rU') as f:
ic = 0
for line in f:
fields = line.split('\t')
if ic > 0:
contentSF.append(Document(fields[6], fields[3],
fields[49].strip('\n').strip('\r'),
fields[17], fields[15], None,
fields[22], None, fields[11],
None, fields[9], None,
dbSF.idNummer))
else:
ic += 1
dbSF.content = contentSF
print 'Finished reading in SciFinder'
# Read in 'PubMed' file and extract relevant information.
dbPM.content = pubmedFormat('pubmed20xx.txt', dbPM.idNummer)
print 'Finished reading in PubMed'
# Read in 'TEMA' file and extract relevant information.
dbTEMA.content = wosFormat('tema20xx.txt', dbTEMA.idNummer)
print 'Finished reading in TEMA'
# Read in 'Inspec' file and extract relevant information.
contentInspec = []
with open('inspec20xx.txt') as f:
ic = 0
for line in f:
fields = line.split('\t')
if ic > 0:
contentInspec.append(Document(fields[6], fields[5], fields[-5],
fields[12], fields[-6], None,
fields[13], fields[-21], None,
None, None, None,
dbInspec.idNummer))
else:
ic += 1
dbInspec.content = contentInspec
print 'Finished reading in Inspec'
# Read in 'IEEE' file and extract relevant information.
dbIEEE.content = wosFormat('ieee20xx.txt', dbIEEE.idNummer)
print 'Finished reading in IEEE'
# Read in 'ProQuest' file and extract relevant information.
dbPQ.content = wosFormat('pq20xx.txt', dbPQ.idNummer)
print 'Finished reading in ProQuest'
# Read in 'Business Source Complete' file and extract relevant information.
dbBSC.content = wosFormat('bsc20xx.txt', dbBSC.idNummer)
print 'Finished reading in Business Source Complete'
# Read in 'GeoRef' file and extract relevant information.
dbGf.content = wosFormat('gf20xx.txt', dbGf.idNummer)
print 'Finished reading in GeoRef'
# Read in 'CAB Abstracts' file and extract relevant information.
dbCAB.content = wosFormat('cab20xx.txt', dbCAB.idNummer)
print 'Finished reading in CAB Abstracts'
# Read in 'CINAHL' file and extract relevant information.
dbCIN.content = wosFormat('cinahl20xx.txt', dbCIN.idNummer)
print 'Finished reading in CINAHL'
# Read in 'EBSCO' file and extract relevant information.
dbEB.content = wosFormat('ebsco20xx.txt', dbEB.idNummer)
print 'Finished reading in EBSCO'
# Read in 'Embase' file and extract relevant information.
dbEm.content = wosFormat('embase20xx.txt', dbEm.idNummer)
print 'Finished reading in Embase'
# Read in 'LISA' file and extract relevant information.
dbLisa.content = wosFormat('lisa20xx.txt', dbLisa.idNummer)
print 'Finished reading in LISA'
# Read in 'Scopus' file and extract relevant information.
dbScopus.content = wosFormat('scopus20xx.txt', dbScopus.idNummer)
print 'Finished reading in Scopus'
# Read in 'SportDiscus' file and extract relevant information.
dbSD.content = wosFormat('sd20xx.txt', dbSD.idNummer)
    print 'Finished reading in SportDiscus'
# do not set up a new database below this line!
# Transform all characters in DOIs to lower case
for item in datenbanken:
for article in item.content:
if article.DOI != None:
article.DOI = article.DOI.lower()
# ----------------------- 5. Duplicate Check ----------------------------------
# Calls the function 'dubletten' above and prints statistics or reads in data
# from previous run of the script
if doReadIn == True:
print 'Remove Duplicates:'
print 'Number of records in "Web of Science": ', len(contentWoS)
finalList = dubletten(1, contentWoS, None, None)
with open('finalList', "wb") as f:
pickle.dump(finalList, f)
print 'Final number of publications: ', len(finalList)
elif doReadIn == False:
with open('finalList', "rb") as f:
finalList = pickle.load(f)
# Remove duplicates that occur within a single database via DOI-matching
# (the function 'dubletten' only compares across databases)
seen = set()
doubles = []
for x in finalList:
if x.DOI != '' and x.DOI != None:
if x.DOI not in seen:
seen.add(x.DOI)
else:
doubles.append(x)
for item in doubles:
if item in finalList:
finalList.remove(item)
# -------------------------- 6. Add DOAJ Data ---------------------------------
# Reads in the file with the data from DOAJ and cross-references it with the
# ISSNs and eISSNs from the database data.
doaj = np.loadtxt('doaj.txt', dtype = 'string', comments = '$#',
skiprows = 1, delimiter = '\t',
usecols = (3, 4, 0, 56, 11, 12, 5, 44))
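# After loading, the selected columns are used as: doaj[:,0] = ISSN,
# doaj[:,1] = eISSN, doaj[:,3] = subject, doaj[:,6] = publisher and
# doaj[:,7] = licence; the usecols indices above depend on the column
# layout of the downloaded DOAJ file and may need adjusting.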
print 'Finished reading in DOAJ data'
issns = collections.Counter(doaj[:,0])
eissns = collections.Counter(doaj[:,1])
oaList = [item for item in finalList
if (item.ISSN != '' and (item.ISSN in eissns or item.ISSN in issns))
or (item.eISSN != '' and (item.eISSN in eissns or item.eISSN in issns))]
print 'Finished identifying OA articles'
# Add information about the subject, publisher and journal licence
for item in oaList:
if item.ISSN != '' and item.ISSN != None:
j = np.where(doaj == item.ISSN)
item.doajSubject = str(doaj[j[0],3]).strip('[').strip(']').strip("'")
item.publisher = str(doaj[j[0],6]).strip('[').strip(']').strip("'")
item.lizenz = str(doaj[j[0],7]).strip('[').strip(']').strip("'")
    if (item.ISSN == '' or item.ISSN == None) and item.eISSN != None:
        k = np.where(doaj == item.eISSN)
        item.doajSubject = str(doaj[k[0],3]).strip('[').strip(']').strip("'")
        item.publisher = str(doaj[k[0],6]).strip('[').strip(']').strip("'")
        item.lizenz = str(doaj[k[0],7]).strip('[').strip(']').strip("'")
# ----------------------- 7. Add CrossRef Data --------------------------------
# Take non-OA articles that have a DOI and send them to the CrossRef API to
# see if there's information about a Creative Commons license.
# For items that are found to be OA after all, the attribute 'checks' is set to
# 'Licence added via CrossRef'. These are added to the list of OA publications 'oaList'.
if contactCR == 1:
print 'Begin querying CrossRef'
nonOA = [item for item in finalList
if item not in oaList
and item.DOI != None
and item.DOI != '']
dochOA, toDOAJ = cr.askCR(nonOA)
for item in dochOA:
item.lizenz = str(item.lizenz)
item.ISSN = str(item.ISSN)
item.checks += 'Licence added via CrossRef'
print 'Finished querying CrossRef'
oaDOAJ = cr.askDOAJ(toDOAJ)
with open('CRResults', "wb") as f:
pickle.dump(dochOA, f)
with open('CRResultsDOAJ', "wb") as f:
pickle.dump(oaDOAJ, f)
print 'Saved CrossRef data to file "CRResults"'
for item in oaDOAJ:
if 'DOAJ=1' in item.checks:
oaList.append(item)
hybrid = [item for item in dochOA if item not in oaList]
elif contactCR == 2:
dochOA = []
oaDOAJ = []
with open('CRResultsDOAJ', "rb") as f:
oaDOAJ = pickle.load(f)
with open('CRResults', "rb") as f:
print 'Load CrossRef data from file'
dochOA = pickle.load(f)
for item in dochOA:
item.lizenz = str(item.lizenz)
item.ISSN = str(item.ISSN)
cL1 = len([x for x in dochOA if "creativecommons.org" in x.lizenz])
cL2 = len([x for x in dochOA if "authorchoice" in x.lizenz])
print 'Number of DOIs for which CC license was found via CrossRef: ', cL1
print 'Number of DOIs for which ACS license was found via CrossRef: ', cL2
for item in dochOA:
if len(item.authors) > 200:
item.authors = item.authors[0:200]
for item in oaDOAJ:
if len(item.authors) > 200:
item.authors = item.authors[0:200]
for item in oaDOAJ:
if 'DOAJ=1' in item.checks:
oaList.append(item)
hybrid = [item for item in dochOA if item not in oaList]
# -- 8. Find out if Authors from relevant Institutions = First/Corr. Author ---
# Adds information about found name variants and shortens author-list
for item in finalList:
if item.dbID in [dbWoS.idNummer, dbSF.idNummer, dbPM.idNummer]:
i, j = listCheck(item.corrAuth, item.dbID)
if i == True:
item.nameVariant = j
if len(item.authors) > 200:
item.authors = item.authors[0:200]
# Find publications from WoS, SciFinder and PubMed for which a name variant of
# a relevant institution was found.
oaWoS = [item for item in oaList if item.dbID == dbWoS.idNummer]
oaWoS = [item for item in oaWoS if item.nameVariant != None]
oaWoS += [item for item in oaList if item.dbID == dbSF.idNummer]
pubMedTest = [item for item in oaList if item.dbID == dbPM.idNummer
and item.nameVariant != None]
if len(pubMedTest) > 0:
oaWoS += pubMedTest
# Write list of articles that need to be checked by hand into a file
# 'docsToBeChecked.txt'
toCheck = [item for item in oaList if item.dbID not in [dbWoS.idNummer,
dbSF.idNummer,
dbPM.idNummer]]
ch = ('authors\ttitle\tDOI\tjournal\tISSN\teISSN\tyear\taffiliations\t'
      'corresponding author\tfound name variant\te-mail\tsubject\t'
      'DOAJ subject\tfunding\tpublisher\tlizenz\tdatabaseID\tnotes\tcollaborators')
if checkToDo == 1:
np.savetxt('docsToBeChecked.txt', [item.arry() for item in toCheck],
delimiter='\t', header = ch, comments = '', fmt="%s")
# Read in articles that were checked by hand and were found to have a
# first/corresponding author from a relevant institution. Add those articles
# to the list of OA articles with first/corresponding author from a relevant
# institution.
elif checkToDo == 2:
addDocs = []
doiList = [x.DOI for x in toCheck if x.DOI != None]
titles1 = [x.konsonanten()[4:] for x in toCheck]
dontknow = []
with open('docsChecked.txt') as f:
for line in f:
fields = line.split('\t')
            fields[0] = fields[0].strip('\xef\xbb\xbf')  # strip UTF-8 BOM
fields[0] = fields[0].strip('\n').strip('\r').strip(' ')
fields[2] = fields[2].strip('\n').strip('\r')
addDocs.append(fields)
for item in addDocs:
if item[1] in doiList:
idDoc = next((x for x in toCheck if x.DOI == item[1]), None)
idDoc.nameVariant = item[2]
idDoc.checks += 'Checked by hand.'
oaWoS.append(idDoc)
elif kons(item[0]) in titles1:
idDoc = next((x for x in toCheck
if x.konsonanten()[4:] == kons(item[0])), None)
idDoc.nameVariant = item[2]
idDoc.checks += 'Checked by hand.'
idDoc.DOI = item[1]
oaWoS.append(idDoc)
else:
dontknow.append(item)
    np.savetxt('docsCheckedCantFind.txt', dontknow,
delimiter='\t', header = 'Title\tDOI\tAffiliation',
comments = '', fmt="%s")
# Write final results
finalNumber = len(oaWoS)
print 'Overall number of articles in DOAJ-journals: ', len(oaList)
if contactCR in [1, 2]:
print 'Overall number of hybrid articles: ', len(hybrid)
print 'Number of articles in DOAJ-journals where author from relevant ' \
      'institution is corresponding author: ', finalNumber
# ----------- 9. Estimate the APCs and print results to file ------------------
# To estimate APCs with 1) other average values or 2) additional options:
# 1) replace the default values, e.g.
#    change the print output 'Estimated APCs (assume 1285 €)' to $amount and
#    change the default value used for multiplication to 'finalNumber * $amount'
# 2) add a new line, e.g.
#    print 'Estimated APCs (assume 1360 €): ', finalNumber * 1360, ' €\n'
print 'Estimated APCs (assume 1285 €): ', finalNumber * 1285, ' €\n'
print 'Estimated APCs (assume 980 €): ', finalNumber * 980, ' €\n'
np.savetxt('allPubs.txt', [item.arry() for item in finalList], delimiter='\t',
header = ch, comments = '', fmt="%s")
np.savetxt('allOAPubs.txt', [item.arry() for item in oaList], delimiter='\t',
header = ch, comments = '', fmt="%s")
np.savetxt('allOAPubsWithCorrAuthor.txt', [item.arry() for item in oaWoS],
delimiter='\t', header = ch, comments = '', fmt="%s")
if contactCR in [1, 2]:
np.savetxt('hybridArticles.txt', [item.arry() for item in hybrid],
delimiter='\t', header = ch, comments = '', fmt="%s")
# ------------------------- 10. Statistics ------------------------------------
# Figure out how many of each kind of publication for each year, display this
# in a table and save data to file
if doAnalysis == True:
yearsAll = [int(x.year) for x in finalList if x.year != None]
yearsOA = [int(x.year) for x in oaList if x.year != None]
yearsOACorr = [int(x.year) for x in oaWoS if x.year != None]
if contactCR in [1, 2]:
yearsHybrid = [int(x.year) for x in hybrid if x.year != None]
else:
yearsHybrid = []
years = sorted(list(set(yearsAll)))
lenyr = len(years)
pubAll = [None] * lenyr
pubOA = [None] * lenyr
pubHybrid = [None] * lenyr
percOA = [None] * lenyr
pubOACorr = [None] * lenyr
percOACorr = [None] * lenyr
percHybrid = [None] * lenyr
ta = PrettyTable(['year', '# Publications', '# Gold OA',
'# Hybrid', '# OA P. + Corr. Author'])
for i in range(0, lenyr):
pubAll[i] = yearsAll.count(years[i])
pubOA[i] = yearsOA.count(years[i])
pubOACorr[i] = yearsOACorr.count(years[i])
pubHybrid[i] = yearsHybrid.count(years[i])
        percOA[i] = round(float(100 * pubOA[i])/float(pubAll[i]), 1)
        # guard against years without any gold OA publications
        percOACorr[i] = (round(float(100 * pubOACorr[i])/float(pubOA[i]), 1)
                         if pubOA[i] > 0 else 0.0)
        percHybrid[i] = round(float(100 * pubHybrid[i])/float(pubAll[i]), 1)
if pubAll[i] > 0:
i1 = str(pubOA[i]) + ' ~ ' +\
str(percOA[i]) + ' %'
i3 = str(pubHybrid[i]) + ' ~ ' +\
str(percHybrid[i]) + ' %'
        else:
            i1 = pubOA[i]
            i3 = pubHybrid[i]
if pubOA[i] > 0:
i2 = str(pubOACorr[i]) + ' ~ ' +\
str(percOACorr[i]) + ' %'
else:
i2 = pubOACorr[i]
ta.add_row([years[i], pubAll[i], i1, i3, i2])
ta.add_row(['----', '----', '----', '----', '----'])
years.append('Sum')
pubAll.append(sum(pubAll))
pubOA.append(sum(pubOA))
pubHybrid.append(sum(pubHybrid))
percOA.append(round(float(100 * sum(pubOA))/float(sum(pubAll)),1))
pubOACorr.append(sum(pubOACorr))
    percOACorr.append(round(float(100 * sum(pubOACorr))/float(sum(pubOA)), 1)
                      if sum(pubOA) > 0 else 0.0)
percHybrid.append(round(float(100 * sum(pubHybrid))/float(sum(pubAll)),1))
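    # map(None, *lists) is the Python 2 idiom for a zip_longest-style
    # transpose: it turns the per-metric lists into per-year rows, padding
    # with None where list lengths differ (here all lists have equal length)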
OAStats = map(list,map(None,*[years, pubAll, pubOA, percOA, pubHybrid,
percHybrid, pubOACorr, percOACorr]))
    ch = ('year\tNo. Publications\tNo. OA Publications\t% OA Publications\t'
          'No. Hybrid Publications\t% Hybrid Publications\t'
          'No. OA Publications + Corr. Author\t% OA Publications with Corr. Author')
np.savetxt('statistics_OA.txt', OAStats,
delimiter='\t', header = ch, comments = '', fmt="%s")
v1 = str(pubOA[-1]) + ' ~ ' + str(percOA[-1]) + ' %'
v3 = str(pubHybrid[-1]) + ' ~ ' + str(percHybrid[-1]) + ' %'
if sum(pubOA) > 0:
v2 = str(pubOACorr[-1]) + ' ~ ' + str(percOACorr[-1]) + ' %'
else:
v2 = sum(pubOACorr)
ta.add_row(['Sum', pubAll[-1], v1, v3, v2])
print ta
    print 'Percentages for gold OA and hybrid articles refer to the total ',\
        'number of articles. The percentage for OA articles with an author ',\
        'from a relevant institution refers to gold OA articles. ',\
        str(len(finalList) - pubAll[-1]), ' articles were excluded from ',\
        'this analysis because the data supplied by the database did not ',\
        'include a publication year.'
# Create graphical output
### please note: if you run this script on a Mac the following 3 lines
### have been known to cause trouble (OS X 10.9.5, Python 2.7.11).
### Comment them out if you run into errors
gr.threebar(years[0:-1], pubAll[0:-1], pubOA[0:-1], pubOACorr[0:-1],
pubHybrid[0:-1])
gr.lineplot1(years, percOA, percOACorr)
gr.otherbar(years, pubAll, pubOA, pubOACorr)
# Do statistics for publishers of OA articles and save results to file
if doAnalysis == True:
publishersAll = [x.publisher for x in oaWoS]
pAN = float(len(publishersAll))
publishers = collections.Counter(publishersAll)
pN = len(publishers)
print 'Number of publishers: ', pN
haeuf = publishers.most_common(pN)
publisherStats = [None] * pN
tally = 0.
noPubl = ['x', 'UNKNOWN', 0, 0, 0]
tb = PrettyTable(['Rank', 'Publisher', '# Publications',
'% of Publications', 'Cumulative % of Publications'])
counts = 0
for i in range(0, pN):
        if haeuf[i][0] == '' or haeuf[i][0] == None:
            noPubl[2] += haeuf[i][1]
        else:
tally += haeuf[i][1]
publisherStats[i] = [counts + 1, haeuf[i][0], haeuf[i][1],
round(100. * haeuf[i][1]/pAN, 2),
round(100. * tally/pAN, 2)]
counts += 1
if counts < 21:
tb.add_row(publisherStats[i])
tally += noPubl[2]
noPubl[3] = round(100. * noPubl[2]/pAN, 2)
noPubl[4] = round(100. * tally/pAN, 2)
publisherStats.append(noPubl)
publisherStats = [item for item in publisherStats if item != None]
ch = 'Rank\tPublisher\t# Publications\t% Publications\t\
Cumulative % of Publications'
np.savetxt('statistics_publishers.txt', publisherStats,
delimiter='\t', header = ch, comments = '', fmt="%s")
print tb
# Print the total runtime in seconds
print time.time() - t