-
Notifications
You must be signed in to change notification settings - Fork 0
/
commit-size-distribution.py
executable file
·241 lines (199 loc) · 6.81 KB
/
commit-size-distribution.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
#!/usr/bin/env python3
import argparse
import hashlib
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import subprocess
import tempfile
def cachefile_name(repository, after, before):
    """
    Build the path of the cache file for one analysis of a repository.

    The file name is the SHA-256 hex digest of, in order: the repository
    argument as given, the raw output of ``git rev-parse HEAD``, and the
    string forms of ``after`` and ``before``. A different HEAD or a different
    time window therefore maps to a different file.

    :param repository: path to the Git repository
    :param after: lower time bound of the analysis, or None
    :param before: upper time bound of the analysis, or None
    :return: ``<tempdir>/commit-size-distribution/<repo basename>/<digest>.csv``
    :raises subprocess.CalledProcessError: if ``git rev-parse HEAD`` fails
    """
    head = subprocess.run(
        ["git", "-C", repository, "rev-parse", "HEAD"],
        stdout=subprocess.PIPE,
        check=True,
    ).stdout

    hasher = hashlib.sha256()
    # The order of the parts is part of the cache key; do not reorder.
    for part in (
        bytes(repository, "UTF-8"),
        head,
        bytes(str(after), "UTF-8"),
        bytes(str(before), "UTF-8"),
    ):
        hasher.update(part)

    repo_label = os.path.basename(os.path.abspath(repository))
    cache_root = os.path.join(tempfile.gettempdir(), "commit-size-distribution")
    return os.path.join(cache_root, repo_label, hasher.hexdigest() + ".csv")
def git_numstat(repository, after, before, cache=True):
    """
    Return per-commit line statistics, served from an on-disk cache if allowed.

    The cache file name is always computed first (which runs
    ``git rev-parse HEAD``), even when caching is disabled.

    :param repository: path to the Git repository
    :param after: lower time bound passed on to the analysis, or None
    :param before: upper time bound passed on to the analysis, or None
    :param cache: when true, read an existing cache file if there is one and
        write the freshly computed result otherwise; when false, neither
        read nor write the cache
    :return: DataFrame as produced by ``uncached_git_numstat`` (or as read
        back from its CSV dump)
    """
    cache_path = cachefile_name(repository, after, before)

    if not cache:
        return uncached_git_numstat(repository, after, before)

    if os.path.exists(cache_path):
        return pd.read_csv(cache_path, index_col=0, parse_dates=False)

    frame = uncached_git_numstat(repository, after, before)
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    frame.to_csv(cache_path)
    return frame
def uncached_git_numstat(repository, after, before):
    """
    Run ``git log --numstat`` and total the added/removed lines per commit.

    Merge commits are excluded (``--no-merges``). Binary files, which git
    reports as ``-<TAB>-<TAB><file>``, contribute nothing to the totals.

    :param repository: path to the Git repository
    :param after: if truthy, passed to git-log as the value of ``--after``
    :param before: if truthy, passed to git-log as the value of ``--before``
    :return: DataFrame with integer columns ``added``, ``removed`` and
        ``changed`` (added + removed), one row per commit in git-log order
    :raises subprocess.CalledProcessError: if git exits with a non-zero status
    """
    cmd = [
        "git",
        "-C",
        repository,
        "log",
        "--no-merges",
        "--format=%H",
        "--numstat",
    ]
    # Each option and its value are two separate argv entries; list.append()
    # only accepts a single item, so extend() is required here.
    if after:
        cmd.extend(["--after", after])
    if before:
        cmd.extend(["--before", before])
    res = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True)

    # A commit header is a bare object id: 40 hex digits for SHA-1
    # repositories, 64 for SHA-256 repositories.
    commit_line = re.compile(rb"[a-f0-9]{40}(?:[a-f0-9]{24})?")

    # stdout format is
    # <commit>
    # <empty>
    # <added><blank><removed><blank><file1>
    # <added><blank><removed><blank><file2>
    # ...
    # Open a fresh [added, removed] accumulator whenever a commit header is
    # seen, so that every commit -- including the last one in the output --
    # ends up with exactly one record.
    totals = []
    for line in res.stdout.splitlines():
        if commit_line.fullmatch(line):
            totals.append([0, 0])
            continue
        fields = line.split()
        if not fields:
            continue
        # Skip binary files
        if fields[0] == b"-":
            continue
        if not totals:
            # Stat line with no preceding commit header; nothing to attribute
            # it to.
            continue
        totals[-1][0] += int(fields[0])
        totals[-1][1] += int(fields[1])

    return pd.DataFrame({
        "added": [a for a, _ in totals],
        "removed": [r for _, r in totals],
        "changed": [a + r for a, r in totals],
    })
# Adapted from https://stackoverflow.com/a/43455567/482758
def mark_hours(ax):
    """
    Efficiently draw vertical lines at increments of 400,
    the middle optimal-inspection-rate, per
    https://www.ibm.com/developerworks/rational/library/11-proven-practices-for-peer-review/

    All lines are drawn with a single ``plot`` call: each line contributes the
    points (x, 0), (x, 1.05) and (x, NaN), and the NaN breaks the path so
    consecutive lines are not joined.

    :param ax: the axes to draw on; its current x limit bounds the lines
    """
    _, x_max = ax.get_xlim()
    # np.arange builds the array directly. np.array(range(...), copy=False)
    # raises ValueError on NumPy 2.x, where copy=False means "never copy".
    xs = np.arange(400, int(x_max), 400)
    if xs.size == 0:
        # The x axis ends before the first marker; nothing to draw.
        return
    x_points = np.repeat(xs, repeats=3)
    y_points = np.repeat(
        np.array((0, 1.05, np.nan))[None, :],
        repeats=len(xs),
        axis=0).flatten()
    # Draw on the axes that were passed in rather than on pyplot's implicit
    # "current" axes. scaley=False keeps the markers from stretching the
    # y axis.
    ax.plot(
        x_points,
        y_points,
        scaley=False,
        color="black",
        linewidth=0.5)
def main(args):
    """
    Plot the cumulative distribution of commit sizes for a repository.

    Reads the following attributes of ``args``: ``repository``, ``after``,
    ``before``, ``cache``, ``max_size``, ``mark_hours``, ``plot_outfile`` and
    ``preview``.

    :param args: parsed command-line arguments (an ``argparse.Namespace``)
    :raises SystemExit: if the analysis yields no commits to plot
    """
    df = git_numstat(
        args.repository,
        args.after,
        args.before,
        args.cache)
    if df.empty:
        # With no rows the histogram would be asked for zero bins and fail
        # with an opaque ValueError; stop with a clear message instead.
        raise SystemExit("no non-merge commits found; nothing to plot")

    plt.style.use("ggplot")
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.set(
        title="Size of non-merge commits",
        xlabel="Lines of code",
        ylabel="Probability")
    ax.yaxis.set_ticks(np.arange(0, 1.1, 0.1))

    x_range = None
    if args.max_size:
        # Although the range kwarg's documentation suggests it represents the
        # "max size" it always uses the supplied value, inflating the x_max
        # when the specified limit exceeds the max input. Compare the max input
        # to prevent that.
        x_range = (0, min(args.max_size, df.changed.max()))

    ax.hist(
        [df.added, df.removed, df.changed],
        bins=len(df.changed),
        # "normed" was removed from Axes.hist; "density" is its replacement.
        density=True,
        histtype="step",
        range=x_range,
        color=("green", "red", "blue"),
        cumulative=True,
        label=("added", "removed", "changed"))
    ax.legend(loc="lower right").set_visible(True)

    if args.mark_hours:
        mark_hours(ax)

    if args.plot_outfile:
        # A bare file name has an empty dirname, and os.makedirs("") raises
        # FileNotFoundError, so only create the directory when there is one.
        out_dir = os.path.dirname(args.plot_outfile)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        fig.savefig(
            args.plot_outfile,
            transparent=False,
            bbox_inches="tight")

    if args.preview:
        plt.show()
if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, check that
    # at least one output (file or preview) was requested, then run main().
    parser = argparse.ArgumentParser(description="""
Plot Git commit patch-size distribution.
Run this command with the path to a Git repository. It will print three
cumulative distribution functions of the number of added, removed, and total
changed lines in every non-merge commit reachable from HEAD.
Either plot-outfile, --preview, or both must be specified.
History analysis takes time linear in the number of commits, which could be a
while, but is cached to speed up successive runs.
""")

    # Positional arguments.
    parser.add_argument(
        "repository",
        help="path to the repository to analyse")
    parser.add_argument(
        "plot_outfile",
        nargs="?",
        metavar="plot-outfile",
        help="""
path to write a PNG plot to. Required if --preview is omitted.""")

    # Time window handed to git-log.
    parser.add_argument(
        "--after",
        help="""
forego analysis of commits before this timespec. Passed directly to
git-log.""")
    parser.add_argument(
        "--before",
        help="Opposite of --after")

    # Plot options.
    parser.add_argument(
        "--mark-hours",
        help="""
draw vertical lines at increments of 400 to indicate hours
necessary for review, according to SmartBear's Cisco study""",
        action="store_true")
    parser.add_argument(
        "--max-size",
        help="""
threshold above which commit sizes should be considered noise and
be disregarded""",
        type=int)
    parser.add_argument(
        "--preview",
        help="""
open a preview of the plot in the foreground. Required if
plot-outfile is omitted.""",
        action="store_true")

    # The flag is spelled as a negative (--no-cache) because switching the
    # cache off is the convenient direction on the command line, while the
    # code prefers a positive "cache" boolean.
    parser.add_argument(
        "--no-cache",
        action="store_false",
        dest="cache",
        help="""
Do not cache the repository analysis""")

    args = parser.parse_args()
    if not (args.plot_outfile or args.preview):
        parser.error("plot-outfile or --preview required")
    main(args)