This repository has been archived by the owner on Jun 19, 2024. It is now read-only.
forked from pytorch/benchmark
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfx_profile.py
executable file
·237 lines (192 loc) · 7.74 KB
/
fx_profile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
#!/usr/bin/env python
from collections import Counter, defaultdict
from functools import partial
from torch.cuda import synchronize
from typing import Any, Dict, Callable, Optional
import argparse
import gc
import logging
import os
import pandas as pd
import re
import time
import warnings
os.environ["FX_PATCH_GETITEM"] = "1" # make BERT fx.symbolic_trace
from torchbenchmark import list_models
from torch.fx import symbolic_trace, Node, GraphModule
from torch.fx.interpreter import Interpreter
import torch
# Benchmarks that do not fx.symbolic_trace(); iter_models() passes over
# any benchmark whose name is listed here.
SKIP = {
    "attention_is_all_you_need_pytorch",
    "demucs",
    "dlrm",
    "maml",
    "yolov3",
    "tacotron2",
    "moco",
    "Super_SloMo",
}
class ProfileStats(object):
    """Accumulate elapsed time and call counts, grouped by a node-derived name.

    ``times`` maps each name to the total seconds recorded for it and
    ``counts`` maps each name to the number of recorded calls.
    """

    @staticmethod
    def _norm(cnt: Counter):
        """Return a Counter whose values are those of ``cnt`` scaled to sum to 1.

        When the values sum to zero (an empty counter, or only zero
        timings) there is nothing to scale by, so every key maps to 0.0
        rather than raising ZeroDivisionError.
        """
        total = sum(cnt.values())
        if not total:
            return Counter({k: 0.0 for k in cnt})
        return Counter({k: v / total for k, v in cnt.items()})

    def __init__(self, get_name: Optional[Callable]):
        """Create empty stats.

        Args:
            get_name: Callable mapping a Node to the string it is grouped
                under. May be None for instances that never call record().
        """
        super().__init__()
        self.times: Dict[str, float] = Counter()
        self.counts: Dict[str, int] = Counter()
        self.get_name = get_name

    def record(self, node: Node, sec: float):
        """Record the timing of a single call of ``node``."""
        name = self.get_name(node)
        self.times[name] += sec
        self.counts[name] += 1

    def summary(self, n=5):
        """Return a one-line breakdown of where the recorded time went.

        The ``n - 1`` most expensive names are listed as ``name:percent``
        and everything else is lumped together as a trailing ``other``.
        """
        most_common = self._norm(self.times).most_common(n - 1)
        parts = [f"{k}:{v:.0%}" for k, v in most_common]
        # Clamp at zero: float round-off can push the listed shares a hair
        # over 1.0, which would otherwise be printed as "-0%".
        other = max(0.0, 1.0 - sum(v for _, v in most_common))
        parts.append(f"other:{other:.0%}")
        return " ".join(parts)
class ProfileAggregate(ProfileStats):
    """Combine the normalized stats of many benchmark runs into one table.

    ``df`` holds one row per benchmark and one column per name; each cell
    is that name's share of the benchmark's recorded time. ``save()``
    writes the table, transposed, to ``<name>.csv``.
    """

    def __init__(self, name: str):
        """Create an empty aggregate that will be saved as ``<name>.csv``."""
        # No naming function is passed: this file only ever calls update()
        # and save() on aggregates, never record().
        super().__init__(None)
        self.df = pd.DataFrame()
        self.name = name

    def update(self, other: ProfileStats, name):
        """Merge stats from a finished benchmark run into this.

        Args:
            other: Stats collected for a single benchmark.
            name: Row label for that benchmark in ``df``.
        """
        shares = self._norm(other.times).most_common(None)
        # Counter.update() counts the *elements* of a plain iterable, so the
        # (name, share) pairs must be turned into a mapping for the shares
        # to be added per name.
        self.times.update(dict(shares))
        self.counts.update(self._norm(other.counts))
        row = pd.DataFrame(
            [[share for _, share in shares]],
            index=[name],
            columns=[key for key, _ in shares],
        )
        # DataFrame.append() no longer exists in pandas >= 2.0; concat is the
        # replacement. The initial empty frame is swapped out rather than
        # concatenated so it cannot influence the resulting dtypes.
        if self.df.shape == (0, 0):
            self.df = row
        else:
            self.df = pd.concat([self.df, row])

    def save(self):
        """Write the table to ``<name>.csv`` with names as rows, sorted by mean share."""
        # Names a benchmark never produced are NaN after the merge; count
        # them as a zero share.
        df = self.df.fillna(0.0).transpose()
        df.insert(0, "AVERAGE", df.mean(axis=1))
        df.sort_values("AVERAGE", ascending=False, inplace=True)
        df.to_csv(f"{self.name}.csv")
        print(f"wrote {self.name}.csv")
# One aggregate per grouping. profile() zips this list with
# FXProfiler.profile_stats, so the order here has to line up with that list.
PROFILES = [
    ProfileAggregate(label)
    for label in (
        "operators",
        "successors1",
        "successors2",
        "predecessors1",
        "predecessors2",
    )
]
class FXProfiler(Interpreter):
    """FX Interpreter that times every node it executes.

    Each node's elapsed time is recorded into five ProfileStats: one keyed
    by the node's own name, two keyed by the name plus one or two levels
    of successors, and two keyed by the name plus one or two levels of
    predecessors.
    """

    # Maps Node.op to a function (interpreter, target) -> display name.
    _op_node_to_name = {
        "call_function": lambda i, t: t.__name__,
        "call_method": lambda i, t: t,
        "call_module": lambda i, t: type(i.fetch_attr(t)).__name__,
        "get_attr": lambda i, t: "get_attr",
        "output": lambda i, t: "output",
        "placeholder": lambda i, t: "placeholder",
    }

    def __init__(self, module: GraphModule):
        super().__init__(module)
        self.profile_stats = [
            ProfileStats(self.get_name),
            ProfileStats(partial(self.succ_name, depth=2)),
            ProfileStats(partial(self.succ_name, depth=3)),
            ProfileStats(partial(self.pred_name, depth=2)),
            ProfileStats(partial(self.pred_name, depth=3)),
        ]
        self._build_edges()

    def _build_edges(self) -> None:
        """Index the producer/consumer edges of ``self.module.graph``."""
        self.successors = defaultdict(list)
        self.predecessors = defaultdict(list)
        for node in self.module.graph.nodes:
            def visit(other_node):
                # A consumer that takes the same producer in several
                # argument slots is still a single successor; listing it
                # repeatedly would make succ_name() report "MANY".
                if node not in self.successors[other_node]:
                    self.successors[other_node].append(node)
                # Predecessors stay one entry per argument slot so that
                # pred_name() shows every operand.
                self.predecessors[node].append(other_node)

            # visit() is called right away, while `node` is still the
            # current loop value.
            torch.fx.map_arg((node.args, node.kwargs), visit)

    def run_node(self, n: Node) -> Any:
        """Timing wrapper around executing an FX Node."""
        start = time.perf_counter()
        result = super().run_node(n)
        # synchronize() runs before the clock is read, so it is part of
        # the measured interval.
        synchronize()
        sec = time.perf_counter() - start
        for prof in self.profile_stats:
            prof.record(n, sec)
        return result

    def get_name(self, n: Node) -> str:
        """Convert a Node to a lower-case string name."""
        return self._op_node_to_name[n.op](self, n.target).lower()

    def pred_name(self, node: Node, depth: int) -> str:
        """Return a string name that includes names of predecessor nodes.

        ``depth`` counts levels including ``node`` itself, so 1 yields
        just the node's own name.
        """
        if depth <= 1:
            return self.get_name(node)
        pred_str = ','.join(self.pred_name(x, depth - 1) for x in self.predecessors[node])
        return f"{self.get_name(node)}({pred_str})"

    def succ_name(self, node: Node, depth: int) -> str:
        """Return a string name that includes names of successor nodes.

        A node with more than one distinct successor is rendered as
        ``name->MANY`` instead of following every branch.
        """
        s = self.successors[node]
        if depth <= 1 or not s:
            return self.get_name(node)
        if len(s) > 1:
            succ_str = "MANY"
        else:
            succ_str = self.succ_name(s[0], depth - 1)
        return f"{self.get_name(node)}->{succ_str}"
def profile(device, name, model, example_inputs, args):
    """Trace ``model`` with FX, time its nodes and merge the results into PROFILES.

    The traced module is first called ``args.warmup`` times directly, then
    run ``args.repeat`` times through an FXProfiler. One summary line per
    aggregate is printed, and the traced module is returned.
    """
    traced = torch.fx.symbolic_trace(model)
    profiler = FXProfiler(traced)

    # Warmup passes call the traced module directly, bypassing the profiler.
    for _ in range(args.warmup):
        traced(*example_inputs)

    for _ in range(args.repeat):
        synchronize()
        profiler.run(*example_inputs)

    for aggregate, stats in zip(PROFILES, profiler.profile_stats):
        line = f"{device:4} {name:20} {aggregate.name:13} {stats.summary()}"
        print(line)
        aggregate.update(stats, name=name)

    return traced
def short_name(name, limit=20):
    """Return ``name`` as is when it fits in ``limit`` chars, else a clipped form ending in "..."."""
    if len(name) <= limit:
        return name
    # Reserve three characters for the ellipsis and drop any underscores
    # left dangling at the cut.
    head = name[:limit - 3].rstrip('_')
    return f"{head}..."
def iter_models(args):
    """Yield ``(short name, model, example inputs)`` for every selected benchmark.

    A benchmark is selected when its name matches one of ``args.filter``,
    matches none of ``args.exclude`` (both case-insensitive regexes) and is
    not listed in SKIP. Benchmarks that raise NotImplementedError while
    being set up are logged and passed over.
    """
    # The patterns do not change between benchmarks, so build them once.
    include = re.compile("|".join(args.filter), re.I)
    exclude = re.compile("|".join(args.exclude), re.I)

    for benchmark_cls in list_models():
        cls_name = benchmark_cls.name
        if (not include.search(cls_name) or
                exclude.search(cls_name) or
                cls_name in SKIP):
            continue
        try:
            benchmark = benchmark_cls(test="eval", device=args.device, jit=False)
            model, example_inputs = benchmark.get_module()
            model.eval()
        except NotImplementedError as exc:
            # Best effort: keep going, but say which model is missing from
            # the results instead of dropping it silently.
            logging.warning("skipping %s: NotImplementedError: %s", cls_name, exc)
            continue
        gc.collect()
        yield short_name(benchmark.name), model, example_inputs
def noop():
    """Do nothing; main() substitutes this for synchronize on CPU runs."""
    return None
def main(args=None):
    """Parse the command line, profile every selected benchmark and write the CSVs.

    Args:
        args: Argument list handed to argparse; None makes argparse read
            the process command line.
    """
    # Declared up front because the CPU branch below rebinds the
    # module-level synchronize.
    global synchronize

    parser = argparse.ArgumentParser()
    parser.add_argument("--filter", "-k", action="append",
                        help="filter benchmarks")
    parser.add_argument("--exclude", "-x", action="append",
                        help="exclude benchmarks")
    parser.add_argument("--device", "-d", help="cpu or cuda")
    parser.add_argument("--warmup", type=int, default=1,
                        help="warmup runs to do")
    parser.add_argument("--repeat", "-n", type=int, default=10,
                        help="number of timing runs")
    parser.add_argument("--threads", "-p", type=int,
                        help="number threads")
    parser.add_argument("--cpu-fusion", action="store_true",
                        help="enable can_fuse_on_cpu")
    parser.add_argument("--no-skip", "-a", action="store_true",
                        help="run models that don't fx cleanly")
    args = parser.parse_args(args)

    # defaults
    args.device = args.device or "cpu"
    args.filter = args.filter or [r"."]    # matches every name
    args.exclude = args.exclude or [r"^$"]  # matches only an empty name

    if args.device == "cpu":
        # Swap the CUDA synchronize for a no-op on CPU runs.
        synchronize = noop

    if args.no_skip:
        SKIP.clear()

    if args.cpu_fusion:
        torch._C._jit_override_can_fuse_on_cpu(True)

    if args.threads:
        torch.set_num_threads(args.threads)

    for name, model, example_inputs in iter_models(args):
        profile(args.device, name, model, example_inputs, args)

    for prof in PROFILES:
        prof.save()
# Script entry point: silence Python warnings and sub-WARNING log records,
# then run the profiler.
if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    logging.basicConfig(level=logging.WARNING)
    main()