-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcitations.py
More file actions
174 lines (138 loc) · 6.64 KB
/
citations.py
File metadata and controls
174 lines (138 loc) · 6.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#!/usr/bin/env python3
"""
Given an author identified by his/her BAI, this simple Python3 script counts the
number of citations and the number of citations excluding self cites in the
Inspirehep database (https://inspirehep.net/) for each paper in a given collection.
"""
__author__ = 'Edgardo Franzin'
__version__ = '3.2'
__license__ = 'GPL'
__email__ = 'edgardo<dot>franzin<at>gmail<dot>com'
# Command-line options; the parsing itself happens inside the local ``parser`` module
from parser import args
from datetime import datetime

# Author identifier (BAI) whose records are analysed
BAI = args.BAI
# Optional year filters, used further down to build the range of years to keep
latest_years = args.latest_years
given_year = args.given_year
# Record filters: collection name and maximum number of authors
collection = args.collection
number_of_authors = args.number_of_authors
# When truthy, the default ordering of the records is reversed
order = args.order

# Current calendar year: upper end of the "latest years" window
current_year = datetime.now().year
# Import functions
from profile import load_profile
from selection import select_collection, select_interval, select_lessauthors, warnings, get_years_range
# Load the author's records, then narrow them down according to the options
data = load_profile(BAI)
if number_of_authors:
    data = select_lessauthors(data, number_of_authors)
data = select_collection(data, collection)
if latest_years:
    # Window of `latest_years` calendar years ending with the current one
    range_years = range(current_year - latest_years + 1, current_year + 1)
    data = select_interval(data, range_years)
if given_year:
    # Single-year window
    range_years = range(given_year, given_year + 1)
    data = select_interval(data, range_years)

# Warning if data is empty: print the message returned by `warnings` and stop
warning = warnings(data, number_of_authors, latest_years, given_year, collection)
if warning:
    print(warning)
    # `exit()` is a helper injected by the `site` module for interactive use
    # and may be missing (e.g. under `python -S`). Raising SystemExit ends the
    # script the same way and keeps the exit status at 0.
    raise SystemExit

# Year of the first and last hits
first_year, last_year, active_years = get_years_range(data)

# Sorting: default is from most recent; `order` reverses the records
if order:
    data = data[::-1]
# Dataclass and arrays to compute the citation metrics for published papers
from dataclasses import dataclass, field
import numpy as np
@dataclass
class Citations:
    """Four parallel per-paper lists: citations, citations without self cites,
    author counts and ages of publication.

    Entry ``i`` of every list refers to the ``i``-th paper that was added.
    """

    cits: list = field(default_factory=list)
    cits_noself: list = field(default_factory=list)
    authors: list = field(default_factory=list)
    age: list = field(default_factory=list)

    def add_citations(self, cits_count, cits_noself_count, author_count, age_of_publication):
        """Append the values of one paper, one value per list."""
        columns = (self.cits, self.cits_noself, self.authors, self.age)
        values = (cits_count, cits_noself_count, author_count, age_of_publication)
        for column, value in zip(columns, values):
            column.append(value)

    def to_numpy(self, dtype=int):
        """Return a dict mapping each field name to a NumPy array of `dtype`.

        Keys are 'cits', 'cits_noself', 'authors' and 'age', in this order.
        """
        return {
            name: np.asarray(getattr(self, name), dtype=dtype)
            for name in ('cits', 'cits_noself', 'authors', 'age')
        }
# One accumulator per subset of records: all of them, the citeable ones and
# the refereed (published) ones
cits_total = Citations()
cits_citeable = Citations()
cits_published = Citations()

for hit in data:
    metadata = hit['metadata']
    cits_count = metadata['citation_count']
    cits_noself_count = metadata['citation_count_without_self_citations']
    author_count = metadata['author_count']
    age_of_publication = metadata['age_of_publication']
    row = (cits_count, cits_noself_count, author_count, age_of_publication)

    # Every record counts towards the totals; the two subsets are selected by
    # the presence of the corresponding metadata key
    cits_total.add_citations(*row)
    for flag, bucket in (('citeable', cits_citeable), ('refereed', cits_published)):
        if flag in metadata:
            bucket.add_citations(*row)

# From here on each name refers to a dict of NumPy arrays instead of a Citations object
cits_total, cits_citeable, cits_published = (
    accumulator.to_numpy()
    for accumulator in (cits_total, cits_citeable, cits_published)
)

# Number of records in each subset
total_hits = cits_total['cits'].size
total_hits_citeable = cits_citeable['cits'].size
total_hits_published = cits_published['cits'].size

# Summed citations, with and without self cites, for each subset
citations = {
    label: {'total': arrays['cits'].sum(), 'noself': arrays['cits_noself'].sum()}
    for label, arrays in (
        ('total', cits_total),
        ('published', cits_published),
        ('citeable', cits_citeable),
    )
}
def bold(text):
    """Wrap `text` in the ANSI escape codes that switch bold on and reset it."""
    start, reset = "\033[1m", "\033[0m"
    return f"{start}{text}{reset}"
def italic(text):
    """Wrap `text` in the ANSI escape codes that switch italic on and reset it."""
    start, reset = "\033[3m", "\033[0m"
    return f"{start}{text}{reset}"
# Print every record: its title (starred when refereed) followed by the number
# of citations with and without self cites
for hit, n_cits, n_cits_noself in zip(data, cits_total['cits'], cits_total['cits_noself']):
    record = hit['metadata']
    marker = '*' if 'refereed' in record else ''
    title = record['titles'][0]['title'] + marker
    print(
        f"{bold(title)}"
        f"\nNumber of citations: {n_cits}; Excluding self cites: {n_cits_noself}"
    )
from summary import count_document_type, print_totals, breakdown_citations
# Headline: number of records in the selection, plus the published and citeable subsets
collection_title = 'research works' if collection == 'all' else collection
print(f'\nNumber of {italic(collection_title)}: {total_hits}, published*: {total_hits_published}, citeable: {total_hits_citeable}')
if number_of_authors:
    print(f'Max number of authors: {number_of_authors}')

if collection == 'all':
    # Breakdown of document types by number; the sum of the values can be
    # larger than total_hits
    doc_type_counts = count_document_type(data)
    for key in sorted(doc_type_counts):
        print(f' {key}: {doc_type_counts[key]}')

# Total number of citations with and without self cites, printed only for
# subsets holding more than one record
if total_hits > 1:
    print_totals(citations['total'])
    # The citeable totals are shown only when they differ from the full set
    if total_hits != total_hits_citeable:
        print_totals(citations['citeable'], '(Citeable only)')
if total_hits_published > 1:
    print_totals(citations['published'], '(Published only)')
# Compute some citation metrics https://en.wikipedia.org/wiki/Author-level_metrics
# In this case they are computed for the published (refereed) data only.
# `indices` is consumed below as a mapping: each key is printed as the metric
# label and each value is indexed as [0] (all citations) and [1] (excluding
# self cites).
# NOTE(review): this runs even when fewer than two published records exist,
# although the result is only printed when total_hits_published > 1 — confirm
# that compute_metrics copes with empty input.
from metrics import compute_metrics
indices = compute_metrics(cits_published, active_years)
def format_index(idx):
    """Format a metric value for printing.

    Whole numbers are rendered without a fractional part (general format,
    e.g. ``3.0`` -> ``'3'``); any other value is rendered with two decimals
    (e.g. ``2.5`` -> ``'2.50'``).

    `idx` may be any real number accepted by ``float()``, e.g. a Python int
    or float, or a NumPy scalar.
    """
    # Normalise through float() first: plain ints only gained .is_integer()
    # in Python 3.12, so calling it directly on an int can raise AttributeError.
    value = float(idx)
    return f'{value:g}' if value.is_integer() else f'{value:.2f}'
# Bibliometric summary, shown only when more than one published record exists
if total_hits_published > 1:
    bibliometrics_last_years = f' (last {latest_years} years)' if latest_years else ''
    summary_head = (
        f'\n--Bibliometrics{bibliometrics_last_years}--'
        f'\nNumber of publications: {total_hits_published}, citeable: {total_hits_citeable}, '
    )
    # The tail reports either the single selected year or the span of active years
    if given_year:
        summary_tail = f'year: {given_year}'
    else:
        summary_tail = f'active years: {active_years} ({first_year}–{last_year})'
    print(summary_head + summary_tail)

    # Average citations per published paper, with and without self cites
    mean_cits = np.mean(cits_published['cits'])
    mean_cits_noself = np.mean(cits_published['cits_noself'])
    print(f"Mean number of citations per paper: {mean_cits:0.1f}; Excluding self cites: {mean_cits_noself:0.1f}")

    # One line per metric: value for all citations, then excluding self cites
    for index, pair in indices.items():
        print(f'{index}: {format_index(pair[0])}; Excluding self cites: {format_index(pair[1])}')
# Breakdown of papers by citations, for the citeable and the published subsets;
# nothing is printed when the helper returns an empty result
breakdown = breakdown_citations(
    cits_citeable['cits'],
    cits_citeable['cits_noself'],
    cits_published['cits'],
    cits_published['cits_noself'],
)
if breakdown:
    print(breakdown)